{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "993297be",
   "metadata": {},
   "source": [
    "# 第3章 金融数据获取及预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "fafa812d",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:12:24.560201Z",
     "start_time": "2024-06-05T02:12:24.556634Z"
    }
   },
   "outputs": [],
   "source": [
    "import  warnings\n",
    "import akshare as ak\n",
    "import baostock as bs\n",
    "import pandas as pd\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "766cb7e0",
   "metadata": {},
   "source": [
    "## 3.1金融数据获取\n",
    "### 3.1.1第三方数据接口"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1f314757",
   "metadata": {},
   "source": [
    "#### （1） akshare"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "7bc45804",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:12:33.056195Z",
     "start_time": "2024-06-05T02:12:26.855349Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>日期</th>\n",
       "      <th>股票代码</th>\n",
       "      <th>开盘</th>\n",
       "      <th>收盘</th>\n",
       "      <th>最高</th>\n",
       "      <th>最低</th>\n",
       "      <th>成交量</th>\n",
       "      <th>成交额</th>\n",
       "      <th>振幅</th>\n",
       "      <th>涨跌幅</th>\n",
       "      <th>涨跌额</th>\n",
       "      <th>换手率</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2024-01-02</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.76</td>\n",
       "      <td>5.78</td>\n",
       "      <td>5.82</td>\n",
       "      <td>5.74</td>\n",
       "      <td>1252778</td>\n",
       "      <td>7.254418e+08</td>\n",
       "      <td>1.39</td>\n",
       "      <td>0.70</td>\n",
       "      <td>0.04</td>\n",
       "      <td>0.32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2024-01-03</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.78</td>\n",
       "      <td>5.85</td>\n",
       "      <td>5.86</td>\n",
       "      <td>5.77</td>\n",
       "      <td>1564273</td>\n",
       "      <td>9.126625e+08</td>\n",
       "      <td>1.56</td>\n",
       "      <td>1.21</td>\n",
       "      <td>0.07</td>\n",
       "      <td>0.40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2024-01-04</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.84</td>\n",
       "      <td>5.90</td>\n",
       "      <td>5.90</td>\n",
       "      <td>5.84</td>\n",
       "      <td>1263685</td>\n",
       "      <td>7.419976e+08</td>\n",
       "      <td>1.03</td>\n",
       "      <td>0.85</td>\n",
       "      <td>0.05</td>\n",
       "      <td>0.32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2024-01-05</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.89</td>\n",
       "      <td>5.91</td>\n",
       "      <td>5.98</td>\n",
       "      <td>5.88</td>\n",
       "      <td>1362116</td>\n",
       "      <td>8.082849e+08</td>\n",
       "      <td>1.69</td>\n",
       "      <td>0.17</td>\n",
       "      <td>0.01</td>\n",
       "      <td>0.35</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2024-01-08</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.90</td>\n",
       "      <td>5.89</td>\n",
       "      <td>5.95</td>\n",
       "      <td>5.85</td>\n",
       "      <td>1078442</td>\n",
       "      <td>6.353274e+08</td>\n",
       "      <td>1.69</td>\n",
       "      <td>-0.34</td>\n",
       "      <td>-0.02</td>\n",
       "      <td>0.27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2024-01-09</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.87</td>\n",
       "      <td>5.88</td>\n",
       "      <td>5.90</td>\n",
       "      <td>5.81</td>\n",
       "      <td>984831</td>\n",
       "      <td>5.767000e+08</td>\n",
       "      <td>1.53</td>\n",
       "      <td>-0.17</td>\n",
       "      <td>-0.01</td>\n",
       "      <td>0.25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2024-01-10</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.87</td>\n",
       "      <td>5.84</td>\n",
       "      <td>5.93</td>\n",
       "      <td>5.84</td>\n",
       "      <td>790095</td>\n",
       "      <td>4.636102e+08</td>\n",
       "      <td>1.53</td>\n",
       "      <td>-0.68</td>\n",
       "      <td>-0.04</td>\n",
       "      <td>0.20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2024-01-11</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.84</td>\n",
       "      <td>5.75</td>\n",
       "      <td>5.84</td>\n",
       "      <td>5.75</td>\n",
       "      <td>1429723</td>\n",
       "      <td>8.284045e+08</td>\n",
       "      <td>1.54</td>\n",
       "      <td>-1.54</td>\n",
       "      <td>-0.09</td>\n",
       "      <td>0.36</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2024-01-12</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.75</td>\n",
       "      <td>5.81</td>\n",
       "      <td>5.83</td>\n",
       "      <td>5.75</td>\n",
       "      <td>1051751</td>\n",
       "      <td>6.100583e+08</td>\n",
       "      <td>1.39</td>\n",
       "      <td>1.04</td>\n",
       "      <td>0.06</td>\n",
       "      <td>0.27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2024-01-15</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.82</td>\n",
       "      <td>5.83</td>\n",
       "      <td>5.84</td>\n",
       "      <td>5.78</td>\n",
       "      <td>911631</td>\n",
       "      <td>5.304019e+08</td>\n",
       "      <td>1.03</td>\n",
       "      <td>0.34</td>\n",
       "      <td>0.02</td>\n",
       "      <td>0.23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>2024-01-16</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.81</td>\n",
       "      <td>5.85</td>\n",
       "      <td>5.87</td>\n",
       "      <td>5.80</td>\n",
       "      <td>1202292</td>\n",
       "      <td>7.022116e+08</td>\n",
       "      <td>1.20</td>\n",
       "      <td>0.34</td>\n",
       "      <td>0.02</td>\n",
       "      <td>0.31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>2024-01-17</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.85</td>\n",
       "      <td>5.80</td>\n",
       "      <td>5.89</td>\n",
       "      <td>5.80</td>\n",
       "      <td>1500743</td>\n",
       "      <td>8.763330e+08</td>\n",
       "      <td>1.54</td>\n",
       "      <td>-0.85</td>\n",
       "      <td>-0.05</td>\n",
       "      <td>0.38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>2024-01-18</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.79</td>\n",
       "      <td>5.79</td>\n",
       "      <td>5.82</td>\n",
       "      <td>5.62</td>\n",
       "      <td>2724694</td>\n",
       "      <td>1.556634e+09</td>\n",
       "      <td>3.45</td>\n",
       "      <td>-0.17</td>\n",
       "      <td>-0.01</td>\n",
       "      <td>0.69</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>2024-01-19</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.76</td>\n",
       "      <td>5.77</td>\n",
       "      <td>5.79</td>\n",
       "      <td>5.68</td>\n",
       "      <td>1476814</td>\n",
       "      <td>8.484675e+08</td>\n",
       "      <td>1.90</td>\n",
       "      <td>-0.35</td>\n",
       "      <td>-0.02</td>\n",
       "      <td>0.38</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>2024-01-22</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.77</td>\n",
       "      <td>5.77</td>\n",
       "      <td>5.82</td>\n",
       "      <td>5.73</td>\n",
       "      <td>2199734</td>\n",
       "      <td>1.273837e+09</td>\n",
       "      <td>1.56</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.56</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>2024-01-23</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.76</td>\n",
       "      <td>5.75</td>\n",
       "      <td>5.78</td>\n",
       "      <td>5.72</td>\n",
       "      <td>1174640</td>\n",
       "      <td>6.747546e+08</td>\n",
       "      <td>1.04</td>\n",
       "      <td>-0.35</td>\n",
       "      <td>-0.02</td>\n",
       "      <td>0.30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>2024-01-24</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.76</td>\n",
       "      <td>5.84</td>\n",
       "      <td>5.86</td>\n",
       "      <td>5.75</td>\n",
       "      <td>1832576</td>\n",
       "      <td>1.064420e+09</td>\n",
       "      <td>1.91</td>\n",
       "      <td>1.57</td>\n",
       "      <td>0.09</td>\n",
       "      <td>0.47</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>2024-01-25</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.87</td>\n",
       "      <td>5.90</td>\n",
       "      <td>5.92</td>\n",
       "      <td>5.84</td>\n",
       "      <td>1792120</td>\n",
       "      <td>1.054502e+09</td>\n",
       "      <td>1.37</td>\n",
       "      <td>1.03</td>\n",
       "      <td>0.06</td>\n",
       "      <td>0.46</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>2024-01-26</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.89</td>\n",
       "      <td>5.97</td>\n",
       "      <td>5.98</td>\n",
       "      <td>5.87</td>\n",
       "      <td>2093862</td>\n",
       "      <td>1.242152e+09</td>\n",
       "      <td>1.86</td>\n",
       "      <td>1.19</td>\n",
       "      <td>0.07</td>\n",
       "      <td>0.53</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>2024-01-29</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.99</td>\n",
       "      <td>6.04</td>\n",
       "      <td>6.09</td>\n",
       "      <td>5.98</td>\n",
       "      <td>1888490</td>\n",
       "      <td>1.139530e+09</td>\n",
       "      <td>1.84</td>\n",
       "      <td>1.17</td>\n",
       "      <td>0.07</td>\n",
       "      <td>0.48</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>2024-01-30</td>\n",
       "      <td>601328</td>\n",
       "      <td>6.03</td>\n",
       "      <td>5.99</td>\n",
       "      <td>6.07</td>\n",
       "      <td>5.95</td>\n",
       "      <td>1335791</td>\n",
       "      <td>8.034151e+08</td>\n",
       "      <td>1.99</td>\n",
       "      <td>-0.83</td>\n",
       "      <td>-0.05</td>\n",
       "      <td>0.34</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>2024-01-31</td>\n",
       "      <td>601328</td>\n",
       "      <td>5.98</td>\n",
       "      <td>6.00</td>\n",
       "      <td>6.03</td>\n",
       "      <td>5.94</td>\n",
       "      <td>1446518</td>\n",
       "      <td>8.662343e+08</td>\n",
       "      <td>1.50</td>\n",
       "      <td>0.17</td>\n",
       "      <td>0.01</td>\n",
       "      <td>0.37</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            日期    股票代码    开盘    收盘    最高    最低      成交量           成交额    振幅  \\\n",
       "0   2024-01-02  601328  5.76  5.78  5.82  5.74  1252778  7.254418e+08  1.39   \n",
       "1   2024-01-03  601328  5.78  5.85  5.86  5.77  1564273  9.126625e+08  1.56   \n",
       "2   2024-01-04  601328  5.84  5.90  5.90  5.84  1263685  7.419976e+08  1.03   \n",
       "3   2024-01-05  601328  5.89  5.91  5.98  5.88  1362116  8.082849e+08  1.69   \n",
       "4   2024-01-08  601328  5.90  5.89  5.95  5.85  1078442  6.353274e+08  1.69   \n",
       "5   2024-01-09  601328  5.87  5.88  5.90  5.81   984831  5.767000e+08  1.53   \n",
       "6   2024-01-10  601328  5.87  5.84  5.93  5.84   790095  4.636102e+08  1.53   \n",
       "7   2024-01-11  601328  5.84  5.75  5.84  5.75  1429723  8.284045e+08  1.54   \n",
       "8   2024-01-12  601328  5.75  5.81  5.83  5.75  1051751  6.100583e+08  1.39   \n",
       "9   2024-01-15  601328  5.82  5.83  5.84  5.78   911631  5.304019e+08  1.03   \n",
       "10  2024-01-16  601328  5.81  5.85  5.87  5.80  1202292  7.022116e+08  1.20   \n",
       "11  2024-01-17  601328  5.85  5.80  5.89  5.80  1500743  8.763330e+08  1.54   \n",
       "12  2024-01-18  601328  5.79  5.79  5.82  5.62  2724694  1.556634e+09  3.45   \n",
       "13  2024-01-19  601328  5.76  5.77  5.79  5.68  1476814  8.484675e+08  1.90   \n",
       "14  2024-01-22  601328  5.77  5.77  5.82  5.73  2199734  1.273837e+09  1.56   \n",
       "15  2024-01-23  601328  5.76  5.75  5.78  5.72  1174640  6.747546e+08  1.04   \n",
       "16  2024-01-24  601328  5.76  5.84  5.86  5.75  1832576  1.064420e+09  1.91   \n",
       "17  2024-01-25  601328  5.87  5.90  5.92  5.84  1792120  1.054502e+09  1.37   \n",
       "18  2024-01-26  601328  5.89  5.97  5.98  5.87  2093862  1.242152e+09  1.86   \n",
       "19  2024-01-29  601328  5.99  6.04  6.09  5.98  1888490  1.139530e+09  1.84   \n",
       "20  2024-01-30  601328  6.03  5.99  6.07  5.95  1335791  8.034151e+08  1.99   \n",
       "21  2024-01-31  601328  5.98  6.00  6.03  5.94  1446518  8.662343e+08  1.50   \n",
       "\n",
       "     涨跌幅   涨跌额   换手率  \n",
       "0   0.70  0.04  0.32  \n",
       "1   1.21  0.07  0.40  \n",
       "2   0.85  0.05  0.32  \n",
       "3   0.17  0.01  0.35  \n",
       "4  -0.34 -0.02  0.27  \n",
       "5  -0.17 -0.01  0.25  \n",
       "6  -0.68 -0.04  0.20  \n",
       "7  -1.54 -0.09  0.36  \n",
       "8   1.04  0.06  0.27  \n",
       "9   0.34  0.02  0.23  \n",
       "10  0.34  0.02  0.31  \n",
       "11 -0.85 -0.05  0.38  \n",
       "12 -0.17 -0.01  0.69  \n",
       "13 -0.35 -0.02  0.38  \n",
       "14  0.00  0.00  0.56  \n",
       "15 -0.35 -0.02  0.30  \n",
       "16  1.57  0.09  0.47  \n",
       "17  1.03  0.06  0.46  \n",
       "18  1.19  0.07  0.53  \n",
       "19  1.17  0.07  0.48  \n",
       "20 -0.83 -0.05  0.34  \n",
       "21  0.17  0.01  0.37  "
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#只需首次安装\n",
    "#!pip install akshare -i https://pypi.tuna.tsinghua.edu.cn/simple\n",
    "import akshare as ak\n",
    "result1=ak.stock_zh_a_hist(symbol='601328',start_date='20240101',end_date='20240131') \n",
    "result1.to_excel('AKShare接口交通银行股票2024年1月每日历史行情数据.xlsx')\n",
    "result1 #输出结果"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f718ed85",
   "metadata": {},
   "source": [
    "#### （2）baostock"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "bef506ba",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "login success!\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>code</th>\n",
       "      <th>open</th>\n",
       "      <th>high</th>\n",
       "      <th>low</th>\n",
       "      <th>close</th>\n",
       "      <th>preclose</th>\n",
       "      <th>volume</th>\n",
       "      <th>amount</th>\n",
       "      <th>adjustflag</th>\n",
       "      <th>turn</th>\n",
       "      <th>tradestatus</th>\n",
       "      <th>pctChg</th>\n",
       "      <th>isST</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2024-01-02</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.7600</td>\n",
       "      <td>5.8200</td>\n",
       "      <td>5.7400</td>\n",
       "      <td>5.7800</td>\n",
       "      <td>5.7400</td>\n",
       "      <td>125277792</td>\n",
       "      <td>725441819.4300</td>\n",
       "      <td>3</td>\n",
       "      <td>0.319200</td>\n",
       "      <td>1</td>\n",
       "      <td>0.696900</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2024-01-03</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.7800</td>\n",
       "      <td>5.8600</td>\n",
       "      <td>5.7700</td>\n",
       "      <td>5.8500</td>\n",
       "      <td>5.7800</td>\n",
       "      <td>156427305</td>\n",
       "      <td>912662450.7600</td>\n",
       "      <td>3</td>\n",
       "      <td>0.398500</td>\n",
       "      <td>1</td>\n",
       "      <td>1.211100</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2024-01-04</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.8400</td>\n",
       "      <td>5.9000</td>\n",
       "      <td>5.8400</td>\n",
       "      <td>5.9000</td>\n",
       "      <td>5.8500</td>\n",
       "      <td>126368459</td>\n",
       "      <td>741997555.2300</td>\n",
       "      <td>3</td>\n",
       "      <td>0.322000</td>\n",
       "      <td>1</td>\n",
       "      <td>0.854700</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2024-01-05</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.8900</td>\n",
       "      <td>5.9800</td>\n",
       "      <td>5.8800</td>\n",
       "      <td>5.9100</td>\n",
       "      <td>5.9000</td>\n",
       "      <td>136211574</td>\n",
       "      <td>808284866.5700</td>\n",
       "      <td>3</td>\n",
       "      <td>0.347000</td>\n",
       "      <td>1</td>\n",
       "      <td>0.169500</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2024-01-08</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.9000</td>\n",
       "      <td>5.9500</td>\n",
       "      <td>5.8500</td>\n",
       "      <td>5.8900</td>\n",
       "      <td>5.9100</td>\n",
       "      <td>107844154</td>\n",
       "      <td>635327397.4700</td>\n",
       "      <td>3</td>\n",
       "      <td>0.274800</td>\n",
       "      <td>1</td>\n",
       "      <td>-0.338400</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>2024-01-09</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.8700</td>\n",
       "      <td>5.9000</td>\n",
       "      <td>5.8100</td>\n",
       "      <td>5.8800</td>\n",
       "      <td>5.8900</td>\n",
       "      <td>98483058</td>\n",
       "      <td>576699973.4400</td>\n",
       "      <td>3</td>\n",
       "      <td>0.250900</td>\n",
       "      <td>1</td>\n",
       "      <td>-0.169800</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2024-01-10</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.8700</td>\n",
       "      <td>5.9300</td>\n",
       "      <td>5.8400</td>\n",
       "      <td>5.8400</td>\n",
       "      <td>5.8800</td>\n",
       "      <td>79009451</td>\n",
       "      <td>463610158.3300</td>\n",
       "      <td>3</td>\n",
       "      <td>0.201300</td>\n",
       "      <td>1</td>\n",
       "      <td>-0.680300</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2024-01-11</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.8400</td>\n",
       "      <td>5.8400</td>\n",
       "      <td>5.7500</td>\n",
       "      <td>5.7500</td>\n",
       "      <td>5.8400</td>\n",
       "      <td>142972319</td>\n",
       "      <td>828404502.6400</td>\n",
       "      <td>3</td>\n",
       "      <td>0.364300</td>\n",
       "      <td>1</td>\n",
       "      <td>-1.541100</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>2024-01-12</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.7500</td>\n",
       "      <td>5.8300</td>\n",
       "      <td>5.7500</td>\n",
       "      <td>5.8100</td>\n",
       "      <td>5.7500</td>\n",
       "      <td>105175081</td>\n",
       "      <td>610058254.6600</td>\n",
       "      <td>3</td>\n",
       "      <td>0.268000</td>\n",
       "      <td>1</td>\n",
       "      <td>1.043500</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2024-01-15</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.8200</td>\n",
       "      <td>5.8400</td>\n",
       "      <td>5.7800</td>\n",
       "      <td>5.8300</td>\n",
       "      <td>5.8100</td>\n",
       "      <td>91163069</td>\n",
       "      <td>530401927.9500</td>\n",
       "      <td>3</td>\n",
       "      <td>0.232300</td>\n",
       "      <td>1</td>\n",
       "      <td>0.344200</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>2024-01-16</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.8100</td>\n",
       "      <td>5.8700</td>\n",
       "      <td>5.8000</td>\n",
       "      <td>5.8500</td>\n",
       "      <td>5.8300</td>\n",
       "      <td>120229236</td>\n",
       "      <td>702211617.2200</td>\n",
       "      <td>3</td>\n",
       "      <td>0.306300</td>\n",
       "      <td>1</td>\n",
       "      <td>0.343100</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>2024-01-17</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.8500</td>\n",
       "      <td>5.8900</td>\n",
       "      <td>5.8000</td>\n",
       "      <td>5.8000</td>\n",
       "      <td>5.8500</td>\n",
       "      <td>150074308</td>\n",
       "      <td>876332955.7200</td>\n",
       "      <td>3</td>\n",
       "      <td>0.382300</td>\n",
       "      <td>1</td>\n",
       "      <td>-0.854700</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>2024-01-18</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.7900</td>\n",
       "      <td>5.8200</td>\n",
       "      <td>5.6200</td>\n",
       "      <td>5.7900</td>\n",
       "      <td>5.8000</td>\n",
       "      <td>272469361</td>\n",
       "      <td>1556634115.9200</td>\n",
       "      <td>3</td>\n",
       "      <td>0.694200</td>\n",
       "      <td>1</td>\n",
       "      <td>-0.172400</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>2024-01-19</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.7600</td>\n",
       "      <td>5.7900</td>\n",
       "      <td>5.6800</td>\n",
       "      <td>5.7700</td>\n",
       "      <td>5.7900</td>\n",
       "      <td>147681383</td>\n",
       "      <td>848467507.6300</td>\n",
       "      <td>3</td>\n",
       "      <td>0.376300</td>\n",
       "      <td>1</td>\n",
       "      <td>-0.345400</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>2024-01-22</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.7700</td>\n",
       "      <td>5.8200</td>\n",
       "      <td>5.7300</td>\n",
       "      <td>5.7700</td>\n",
       "      <td>5.7700</td>\n",
       "      <td>219973362</td>\n",
       "      <td>1273837167.0600</td>\n",
       "      <td>3</td>\n",
       "      <td>0.560400</td>\n",
       "      <td>1</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>2024-01-23</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.7600</td>\n",
       "      <td>5.7800</td>\n",
       "      <td>5.7200</td>\n",
       "      <td>5.7500</td>\n",
       "      <td>5.7700</td>\n",
       "      <td>117464032</td>\n",
       "      <td>674754623.0900</td>\n",
       "      <td>3</td>\n",
       "      <td>0.299300</td>\n",
       "      <td>1</td>\n",
       "      <td>-0.346600</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>2024-01-24</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.7600</td>\n",
       "      <td>5.8600</td>\n",
       "      <td>5.7500</td>\n",
       "      <td>5.8400</td>\n",
       "      <td>5.7500</td>\n",
       "      <td>183257551</td>\n",
       "      <td>1064419708.0200</td>\n",
       "      <td>3</td>\n",
       "      <td>0.466900</td>\n",
       "      <td>1</td>\n",
       "      <td>1.565200</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>2024-01-25</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.8700</td>\n",
       "      <td>5.9200</td>\n",
       "      <td>5.8400</td>\n",
       "      <td>5.9000</td>\n",
       "      <td>5.8400</td>\n",
       "      <td>179212006</td>\n",
       "      <td>1054502353.8200</td>\n",
       "      <td>3</td>\n",
       "      <td>0.456600</td>\n",
       "      <td>1</td>\n",
       "      <td>1.027400</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>2024-01-26</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.8900</td>\n",
       "      <td>5.9800</td>\n",
       "      <td>5.8700</td>\n",
       "      <td>5.9700</td>\n",
       "      <td>5.9000</td>\n",
       "      <td>209386150</td>\n",
       "      <td>1242152043.7000</td>\n",
       "      <td>3</td>\n",
       "      <td>0.533500</td>\n",
       "      <td>1</td>\n",
       "      <td>1.186400</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>2024-01-29</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.9900</td>\n",
       "      <td>6.0900</td>\n",
       "      <td>5.9800</td>\n",
       "      <td>6.0400</td>\n",
       "      <td>5.9700</td>\n",
       "      <td>188849038</td>\n",
       "      <td>1139529518.8600</td>\n",
       "      <td>3</td>\n",
       "      <td>0.481100</td>\n",
       "      <td>1</td>\n",
       "      <td>1.172500</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>2024-01-30</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>6.0300</td>\n",
       "      <td>6.0700</td>\n",
       "      <td>5.9500</td>\n",
       "      <td>5.9900</td>\n",
       "      <td>6.0400</td>\n",
       "      <td>133579069</td>\n",
       "      <td>803415134.2600</td>\n",
       "      <td>3</td>\n",
       "      <td>0.340300</td>\n",
       "      <td>1</td>\n",
       "      <td>-0.827800</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>2024-01-31</td>\n",
       "      <td>sh.601328</td>\n",
       "      <td>5.9800</td>\n",
       "      <td>6.0300</td>\n",
       "      <td>5.9400</td>\n",
       "      <td>6.0000</td>\n",
       "      <td>5.9900</td>\n",
       "      <td>144651806</td>\n",
       "      <td>866234311.1000</td>\n",
       "      <td>3</td>\n",
       "      <td>0.368500</td>\n",
       "      <td>1</td>\n",
       "      <td>0.166900</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          date       code    open    high     low   close preclose     volume  \\\n",
       "0   2024-01-02  sh.601328  5.7600  5.8200  5.7400  5.7800   5.7400  125277792   \n",
       "1   2024-01-03  sh.601328  5.7800  5.8600  5.7700  5.8500   5.7800  156427305   \n",
       "2   2024-01-04  sh.601328  5.8400  5.9000  5.8400  5.9000   5.8500  126368459   \n",
       "3   2024-01-05  sh.601328  5.8900  5.9800  5.8800  5.9100   5.9000  136211574   \n",
       "4   2024-01-08  sh.601328  5.9000  5.9500  5.8500  5.8900   5.9100  107844154   \n",
       "5   2024-01-09  sh.601328  5.8700  5.9000  5.8100  5.8800   5.8900   98483058   \n",
       "6   2024-01-10  sh.601328  5.8700  5.9300  5.8400  5.8400   5.8800   79009451   \n",
       "7   2024-01-11  sh.601328  5.8400  5.8400  5.7500  5.7500   5.8400  142972319   \n",
       "8   2024-01-12  sh.601328  5.7500  5.8300  5.7500  5.8100   5.7500  105175081   \n",
       "9   2024-01-15  sh.601328  5.8200  5.8400  5.7800  5.8300   5.8100   91163069   \n",
       "10  2024-01-16  sh.601328  5.8100  5.8700  5.8000  5.8500   5.8300  120229236   \n",
       "11  2024-01-17  sh.601328  5.8500  5.8900  5.8000  5.8000   5.8500  150074308   \n",
       "12  2024-01-18  sh.601328  5.7900  5.8200  5.6200  5.7900   5.8000  272469361   \n",
       "13  2024-01-19  sh.601328  5.7600  5.7900  5.6800  5.7700   5.7900  147681383   \n",
       "14  2024-01-22  sh.601328  5.7700  5.8200  5.7300  5.7700   5.7700  219973362   \n",
       "15  2024-01-23  sh.601328  5.7600  5.7800  5.7200  5.7500   5.7700  117464032   \n",
       "16  2024-01-24  sh.601328  5.7600  5.8600  5.7500  5.8400   5.7500  183257551   \n",
       "17  2024-01-25  sh.601328  5.8700  5.9200  5.8400  5.9000   5.8400  179212006   \n",
       "18  2024-01-26  sh.601328  5.8900  5.9800  5.8700  5.9700   5.9000  209386150   \n",
       "19  2024-01-29  sh.601328  5.9900  6.0900  5.9800  6.0400   5.9700  188849038   \n",
       "20  2024-01-30  sh.601328  6.0300  6.0700  5.9500  5.9900   6.0400  133579069   \n",
       "21  2024-01-31  sh.601328  5.9800  6.0300  5.9400  6.0000   5.9900  144651806   \n",
       "\n",
       "             amount adjustflag      turn tradestatus     pctChg isST  \n",
       "0    725441819.4300          3  0.319200           1   0.696900    0  \n",
       "1    912662450.7600          3  0.398500           1   1.211100    0  \n",
       "2    741997555.2300          3  0.322000           1   0.854700    0  \n",
       "3    808284866.5700          3  0.347000           1   0.169500    0  \n",
       "4    635327397.4700          3  0.274800           1  -0.338400    0  \n",
       "5    576699973.4400          3  0.250900           1  -0.169800    0  \n",
       "6    463610158.3300          3  0.201300           1  -0.680300    0  \n",
       "7    828404502.6400          3  0.364300           1  -1.541100    0  \n",
       "8    610058254.6600          3  0.268000           1   1.043500    0  \n",
       "9    530401927.9500          3  0.232300           1   0.344200    0  \n",
       "10   702211617.2200          3  0.306300           1   0.343100    0  \n",
       "11   876332955.7200          3  0.382300           1  -0.854700    0  \n",
       "12  1556634115.9200          3  0.694200           1  -0.172400    0  \n",
       "13   848467507.6300          3  0.376300           1  -0.345400    0  \n",
       "14  1273837167.0600          3  0.560400           1   0.000000    0  \n",
       "15   674754623.0900          3  0.299300           1  -0.346600    0  \n",
       "16  1064419708.0200          3  0.466900           1   1.565200    0  \n",
       "17  1054502353.8200          3  0.456600           1   1.027400    0  \n",
       "18  1242152043.7000          3  0.533500           1   1.186400    0  \n",
       "19  1139529518.8600          3  0.481100           1   1.172500    0  \n",
       "20   803415134.2600          3  0.340300           1  -0.827800    0  \n",
       "21   866234311.1000          3  0.368500           1   0.166900    0  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#!pip install baostock -i https://pypi.tuna.tsinghua.edu.cn/simple \n",
    "import baostock as bs\n",
    "import pandas as pd\n",
    "lg=bs.login()#登录\n",
    "rs=bs.query_history_k_data_plus('sh.601328','date,code,open,high,low,close,preclose,volume,amount,adjustflag,turn,tradestatus,pctChg,isST',start_date='2024-01-01', end_date='2024-01-31',frequency='d', adjustflag='3')\n",
    "data_list=[]\n",
    "while (rs.error_code == '0') & rs.next():\n",
    "    data_list.append(rs.get_row_data())\n",
    "result2=pd.DataFrame(data_list, columns=rs.fields)\n",
    "result2.to_excel('BaoStock接口交通银行股票2024年1月历史K线数据.xlsx')\n",
    "result2"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b84b43c8",
   "metadata": {},
   "source": [
    "### 3.1.2网络爬虫"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "55df721a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "url='https://s.askci.com/stock/a/'\n",
    "tabs=pd.read_html(url) #爬取表格数据\n",
    "len(tabs) #结果输出"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "db75bb19",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1000, 15)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "dat=[]\n",
    "for i in range(1,51): \n",
    "    d=pd.read_html('https://s.askci.com/stock/a/0-0?reportTime=2023-09-30&pageNum='+str(i)+'#QueryCondition')[3]\n",
    "    #print(i)\n",
    "    dat.append(d)\n",
    "data=pd.concat(dat,axis=0)\n",
    "data.to_excel('A股上市企业股票数据.xlsx',index=False)\n",
    "data.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "39dd417c",
   "metadata": {},
   "source": [
    "## 3.2数据清洗"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "6b5c316c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1960 entries, 0 to 1959\n",
      "Data columns (total 9 columns):\n",
      " #   Column  Non-Null Count  Dtype \n",
      "---  ------  --------------  ----- \n",
      " 0   股票代码    1960 non-null   int64 \n",
      " 1   股票简称    1960 non-null   object\n",
      " 2   省份      1960 non-null   object\n",
      " 3   城市      1959 non-null   object\n",
      " 4   主营业务收入  1906 non-null   object\n",
      " 5   净利润     1906 non-null   object\n",
      " 6   员工人数    1960 non-null   int64 \n",
      " 7   上市日期    1960 non-null   object\n",
      " 8   产品类型    1171 non-null   object\n",
      "dtypes: int64(2), object(7)\n",
      "memory usage: 137.9+ KB\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "data=pd.read_excel('上市企业基本信息.xlsx')\n",
    "data.info()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c144a1a0",
   "metadata": {},
   "source": [
    "### 3.2.1缺失值处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "80efebb1",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:15:16.929284Z",
     "start_time": "2024-06-05T02:15:16.922896Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "缺失值的数量：\n",
      " 股票代码        0\n",
      "股票简称        0\n",
      "省份          0\n",
      "城市          1\n",
      "主营业务收入     54\n",
      "净利润        54\n",
      "员工人数        0\n",
      "上市日期        0\n",
      "产品类型      789\n",
      "dtype: int64\n",
      "-=-=-=-=-=-=-=-=-=-=\n",
      "缺失值的占比：\n",
      " 股票代码      0.00\n",
      "股票简称      0.00\n",
      "省份        0.00\n",
      "城市        0.00\n",
      "主营业务收入    0.03\n",
      "净利润       0.03\n",
      "员工人数      0.00\n",
      "上市日期      0.00\n",
      "产品类型      0.40\n",
      "dtype: float64\n"
     ]
    }
   ],
   "source": [
    "# 显示缺失值的数量和占比\n",
    "missing_values=data.isnull().sum()\n",
    "print(\"缺失值的数量：\\n\",missing_values)\n",
    "print('-='*10)\n",
    "missing_ratio=missing_values/data.shape[0]\n",
    "print(\"缺失值的占比：\\n\",missing_ratio.round(2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "7f89d1ec",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1905, 8)\n"
     ]
    }
   ],
   "source": [
    "data=data.drop(['产品类型'],axis=1)#删除列\n",
    "data=data.dropna()#删除缺失值所在行\n",
    "data['城市']=data['城市'].fillna('深圳市') #缺失值填充\n",
    "data=data.reset_index(drop=True)# 重置索引\n",
    "print(data.shape) #输出数据形状"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "58bcd814",
   "metadata": {},
   "source": [
    "### 3.2.2重复值处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "92c92041",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:15:18.235565Z",
     "start_time": "2024-06-05T02:15:18.225937Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>股票代码</th>\n",
       "      <th>股票简称</th>\n",
       "      <th>省份</th>\n",
       "      <th>城市</th>\n",
       "      <th>主营业务收入</th>\n",
       "      <th>净利润</th>\n",
       "      <th>员工人数</th>\n",
       "      <th>上市日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>平安银行</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>1276.34亿</td>\n",
       "      <td>396.35亿</td>\n",
       "      <td>44077</td>\n",
       "      <td>1991-04-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>万科A</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>2903.08亿</td>\n",
       "      <td>210.27亿</td>\n",
       "      <td>131817</td>\n",
       "      <td>1991-01-29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5</td>\n",
       "      <td>ST星源</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>1.27亿</td>\n",
       "      <td>-1149.40万</td>\n",
       "      <td>488</td>\n",
       "      <td>1990-12-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>6</td>\n",
       "      <td>深振业A</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>8.29亿</td>\n",
       "      <td>-420.71万</td>\n",
       "      <td>425</td>\n",
       "      <td>1992-04-27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>7</td>\n",
       "      <td>*ST全新</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>1.53亿</td>\n",
       "      <td>801.25万</td>\n",
       "      <td>76</td>\n",
       "      <td>1992-04-13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>8</td>\n",
       "      <td>神州高铁</td>\n",
       "      <td>北京</td>\n",
       "      <td>北京市</td>\n",
       "      <td>13.08亿</td>\n",
       "      <td>-1.46亿</td>\n",
       "      <td>2331</td>\n",
       "      <td>1992-05-07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>8</td>\n",
       "      <td>神州高铁</td>\n",
       "      <td>北京</td>\n",
       "      <td>北京市</td>\n",
       "      <td>13.08亿</td>\n",
       "      <td>-1.46亿</td>\n",
       "      <td>2331</td>\n",
       "      <td>1992-05-07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>9</td>\n",
       "      <td>中国宝安</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>245.17亿</td>\n",
       "      <td>14.89亿</td>\n",
       "      <td>17193</td>\n",
       "      <td>1991-06-25</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   股票代码   股票简称  省份   城市    主营业务收入        净利润    员工人数        上市日期\n",
       "0     1   平安银行  广东  深圳市  1276.34亿    396.35亿   44077  1991-04-03\n",
       "1     2    万科A  广东  深圳市  2903.08亿    210.27亿  131817  1991-01-29\n",
       "2     5   ST星源  广东  深圳市     1.27亿  -1149.40万     488  1990-12-10\n",
       "3     6   深振业A  广东  深圳市     8.29亿   -420.71万     425  1992-04-27\n",
       "4     7  *ST全新  广东  深圳市     1.53亿    801.25万      76  1992-04-13\n",
       "5     8   神州高铁  北京  北京市    13.08亿     -1.46亿    2331  1992-05-07\n",
       "6     8   神州高铁  北京  北京市    13.08亿     -1.46亿    2331  1992-05-07\n",
       "7     9   中国宝安  广东  深圳市   245.17亿     14.89亿   17193  1991-06-25"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head(8) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "843ecdb9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(1902, 8)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(data.shape[0]-data.drop_duplicates().shape[0]) #输出重复值个数\n",
    "data=data.drop_duplicates() #删除重复值\n",
    "data.shape #显示数据形状\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "391d94f3",
   "metadata": {},
   "source": [
    "### 3.2.3异常值处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "13038839",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAiIAAAGbCAYAAAD5mfsKAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8pXeV/AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAezElEQVR4nO3df2xV9eH/8dflXnrLDb2X3tGIXK+2/HCFQoc01a4GEpNO3aSxELJhNUSMjhkm89dN1smyLFvaarCyLVM/LGhXf8wR09TNBXVT0w8u1uwidru55YfC1ba4BFw552LxKO39/rEv95NOCmUc+u5tn4/kxJxz7n2/3/51n5x7eq4nk8lkBAAAYMA00wsAAABTFyECAACMIUQAAIAxhAgAADCGEAEAAMYQIgAAwBhCBAAAGEOIAAAAY3ymF3Auw8PDOnLkiAoKCuTxeEwvBwAAjEEmk1E6ndbcuXM1bdro1z0mfIgcOXJE0WjU9DIAAMB/obe3V5dddtmo5yd8iBQUFEj69/9IMBg0vBoAADAWtm0rGo1mP8dHM+FD5PTXMcFgkBABACDHnOu2Cm5WBQAAxhAiAADAGEIEAAAYQ4gAAABjCBEAAGAMIQIAAIwhRAAAgDGECAAAMGbCP9AMwOQ0NDSk3bt36+OPP9all16qFStWyOv1ml4WgHF23ldEPvnkE5WUlCiVSmWPJRIJVVZWqrCwULFYTJlMZkznAExN7e3tWrBgga677jrV19fruuuu04IFC9Te3m56aQDG2XmFyLFjx7Rq1aoREeI4jmpra1VRUaF4PK5kMqnW1tZzngMwNbW3t2vt2rVaunSp3n77baXTab399ttaunSp1q5dS4wAU4wncx6XKGpqalRbW6t7771Xhw8fVnFxsTo6OnTHHXeor69PgUBA3d3d2rRpk956662znhsr27YVCoVkWRa/NQPkuKGhIS1YsEBLly5VR0fHiJ8GHx4eVl1dnRKJhA4ePMjXNECOG+vn93ldEdm+fbt+8IMfjDjW3d2tqqoqBQIBSVJ5ebmSyeQ5z43GcRzZtj1iAzA57N69W6lUSj/60Y9GRIgkTZs2TQ0NDTp8+LB2795taIUAxtt5hci8efO+dMy2bZWUlGT3PR6PvF6vBgYGznpuNE1NTQqFQtktGo2ezxIBTGAff/yxJGnJkiVnPH/6+OnXAZj8LvjPd30+n/x+/4hj+fn5GhwcPOu50TQ0NMiyrOzW29t7oUsEMEFceumlkv59E/uZnD5++nUAJr8LDpFwOKyjR4+OOJZOp5WXl3fWc6Px+/0KBoMjNgCTw4oVK1RcXKzGxkYNDw+PODc8PKympiaVlJRoxYoVhlYIYLxdcIhUVlaqq6sru59KpeQ4jsLh8FnPAZh6vF6vHn30Ub388suqq6sb8VczdXV1evnll7V161ZuVAWmkAsOkZUrV8qyLLW1tUmSmpubVVNTI6/Xe9ZzAKamNWvW6MUXX9Q//vEPVVdXKxgMqrq6WolEQi+++KLWrFljeokAxtF5/flu9k0eT/bPdyWpo6ND9fX1Kigo0NDQkDo7O1VWVnbOc2PBn+8CkxNPVgUmt7F+fv9XIXIm/f39isfjqq6uVlFR0ZjPnQshAgBA7hnr57drvzUTiUQUiUTO+xwAAJi6+PVdAABgDCECAACMIUQAAIAxhAgAADCGEAEAAMYQIgAAwBhCBAAAGEOIAAAAYwgRAABgDCECAACMIUQAAIAxhAgAADCGEAEAAMYQIgAAwBhCBAAAGEOIAAAAYwgRAABgDCECAACMIUQAAIAxhAgAADCGEAEAAMYQIgAAwBhCBAAAGEOIAAAAYwgRAABgDCECAACMIUQAAIAxhAgAADCGEAEAAMYQIgAAwBjXQuTpp5/WkiVLNGvWLN1yyy06duyYJCmRSKiyslKFhYWKxWLKZDJuTQkAAHKcKyHyl7/8RZs3b9Zjjz2m7u5u2bat1atXy3Ec1dbW
qqKiQvF4XMlkUq2trW5MCQAAJgFPxoVLFOvXr9dXvvIVPfbYY5KkZDKpsrIyvfjii7rrrrvU19enQCCg7u5ubdq0SW+99daYx7ZtW6FQSJZlKRgMXuhSAQDAOBjr57fPjcmOHTumq666Krvv9Xol/ftrmaqqKgUCAUlSeXm5ksnkWcdyHEeO42T3bdt2Y4kAAGACcuWrmWXLlukPf/hD9v6Pp59+WldffbVs21ZJSUn2dR6PR16vVwMDA6OO1dTUpFAolN2i0agbSwQAABOQKyHy4IMP6vPPP1dFRYWqq6v18MMP6/vf/758Pp/8fv+I1+bn52twcHDUsRoaGmRZVnbr7e11Y4kAAGACciVEwuGw/vrXv2rnzp0qLy9XaWmp6uvrFQ6HdfTo0RGvTafTysvLG3Usv9+vYDA4YgMAAJOTq88RmTt3rtrb29XU1CSv16vKykp1dXVlz6dSKTmOo3A47Oa0AAAgR7kaIr/61a9UWlqquro6SdLKlStlWZba2tokSc3NzaqpqcnezAoAAKY2V/5qRpKOHz+uRx55RK+88sr/De7zafv27aqvr1csFtPQ0JA6OzvdmhIAAOQ4V54jci79/f2Kx+Oqrq5WUVHReb2X54gAAJB7xvU5IucSiUQUiUTGYyoAAJBD+NE7AABgDCECAACMIUQAAIAxhAgAADCGEAEAAMYQIgAAwBhCBAAAGEOIAAAAYwgRAABgDCECAACMIUQAAIAxhAgAADCGEAEAAMYQIgAAwBhCBAAAGEOIAAAAYwgRAABgDCECAACMIUQAAIAxhAgAADCGEAEAAMYQIgAAwBhCBAAAGEOIAAAAYwgRAABgDCECAACMIUQAAIAxhAgAADCGEAEAAMYQIgAAwBjXQuSZZ57R5ZdfrpkzZ6qmpkapVEqSlEgkVFlZqcLCQsViMWUyGbemBAAAOc6VEPnggw/00EMPqaOjQ8lkUldccYVuv/12OY6j2tpaVVRUKB6PK5lMqrW11Y0pAQDAJOBKiOzdu1dVVVVavny5Lr/8cm3YsEEHDhzQrl27ZFmWWlpaNH/+fDU2NmrHjh1uTAkAACYBnxuDLF68WG+88Yb27t2refPm6de//rW+8Y1vqLu7W1VVVQoEApKk8vJyJZPJs47lOI4cx8nu27btxhIBAMAE5MoVkcWLF2vt2rVavny5Zs2apXfeeUdbt26VbdsqKSnJvs7j8cjr9WpgYGDUsZqamhQKhbJbNBp1Y4kAAGACciVEurq69Mc//lHvvPOO0um0brnlFn3rW9+Sz+eT3+8f8dr8/HwNDg6OOlZDQ4Msy8puvb29biwRAABMQK6EyO9//3utW7dOV199tWbOnKmf//znOnTokMLhsI4ePTritel0Wnl5eaOO5ff7FQwGR2wAAGBycuUekVOnTo34uiWdTuvTTz+Vz+dTV1dX9ngqlZLjOAqHw25MCwAAcpwrV0SuvfZatbe367HHHtPzzz+vuro6XXLJJdq8ebMsy1JbW5skqbm5WTU1NfJ6vW5MCwAAcpwrV0S+853vaP/+/dq2bZs+/vhjLVmyRO3t7Zo+fbq2b9+u+vp6xWIxDQ0NqbOz040pAQDAJODJjMOjTvv7+xWPx1VdXa2ioqLzeq9t2wqFQrIsi/tFAADIEWP9/Hblisi5RCIRRSKR8ZgKAADkEH70DgAAGEOIAAAAYwgRAABgDCECAACMIUQAAIAxhAgAADCGEAEAAMYQIgAAwBhCBAAAGEOIAAAAYwgRAABgDCECAACMIUQAAIAxhAgAADCGEAEAAMYQIgAAwBhCBAAAGEOIAAAAYwgRAABgDCECAACMIUQAAIAxhAgAADCGEAEAAMYQIgAAwBhCBAAAGEOIAAAAYwgRAABgDCECAACMIUQAAIAxhAgAADDGtRBpbW2Vx+P50tba2qpEIqHKykoVFhYqFospk8m4NS0AAMhhroVIfX29BgYGsltvb69mz56tr3/966qtrVVFRYXi8biSyaRa
W1vdmhYAAOQw10IkLy9Ps2bNym5tbW1as2aNenp6ZFmWWlpaNH/+fDU2NmrHjh1uTQsAAHKY72IM+tlnn+kXv/iF3nnnHf32t79VVVWVAoGAJKm8vFzJZHLU9zqOI8dxsvu2bV+MJQIAgAngotys+vzzz6uqqkrFxcWybVslJSXZcx6PR16vVwMDA2d8b1NTk0KhUHaLRqMXY4kAAGACuCgh8uSTT+p73/ueJMnn88nv9484n5+fr8HBwTO+t6GhQZZlZbfe3t6LsUQAADABuP7VzPvvv6/3339fNTU1kqRwOKxEIjHiNel0Wnl5eWd8v9/v/1K4AACAycn1KyI7d+7UqlWrNH36dElSZWWlurq6sudTqZQcx1E4HHZ7agAAkGNcD5FXXnlF1113XXZ/5cqVsixLbW1tkqTm5mbV1NTI6/W6PTUAAMgxnoyLTxc7efKkZs2ape7ubpWWlmaPd3R0qL6+XgUFBRoaGlJnZ6fKysrGNKZt2wqFQrIsS8Fg0K2lAgCAi2isn9+u3iMyY8aMEX96e1pdXZ0OHjyoeDyu6upqFRUVuTktAADIURflOSJnEolEFIlExms6AACQA/jROwAAYAwhAgAAjCFEAACAMYQIAAAwhhABAADGECIAAMAYQgQAABhDiAAAAGMIEQAAYAwhAgAAjCFEAACAMYQIAAAwhhABAADGECIAAMAYQgQAABhDiAAAAGMIEQAAYAwhAgAAjCFEAACAMYQIAAAwhhABAADGECIAAMAYQgQAABhDiAAAAGMIEQAAYAwhAgAAjCFEAACAMYQIAAAwhhABAADGECIAAMCYixIiP/zhD1VbW5vdTyQSqqysVGFhoWKxmDKZzMWYFgAA5BjXQySRSOjxxx/Xtm3bJEmO46i2tlYVFRWKx+NKJpNqbW11e1oAAJCDXA2RTCajjRs36t5779X8+fMlSbt27ZJlWWppadH8+fPV2NioHTt2uDktAADIUa6GyG9+8xu99957Kikp0csvv6wvvvhC3d3dqqqqUiAQkCSVl5crmUyOOobjOLJte8QGAAAmJ9dC5MSJE9qyZYsWLlyovr4+tbS0aOXKlbJtWyUlJdnXeTweeb1eDQwMnHGcpqYmhUKh7BaNRt1aIgAAmGBcC5H29nZ9+umneuONN/TjH/9Yr732mo4fP66nnnpKfr9/xGvz8/M1ODh4xnEaGhpkWVZ26+3tdWuJAABggvG5NVBfX5+uueYahcPhfw/s86m8vFypVEpHjx4d8dp0Oq28vLwzjuP3+78ULgAAYHJy7YpINBrVyZMnRxz78MMP9eijj6qrqyt7LJVKyXGcbLAAAICpy7UQuemmm9TT06Mnn3xSfX19+uUvf6n33ntP119/vSzLUltbmySpublZNTU18nq9bk0NAABylGtfzYTDYb3yyit64IEHdP/992vOnDl64YUXtGDBAm3fvl319fWKxWIaGhpSZ2enW9MCAIAc5smM02NO+/v7FY/HVV1draKiojG/z7ZthUIhWZalYDB4EVcIAADcMtbPb9euiJxLJBJRJBIZr+kAAEAO4EfvAACAMYQIAAAwhhABAADGECIAAMAYQgQAABhDiAAAAGMIEQAAYAwhAgAAjCFEAACAMYQIAAAwhhABAADGECIAAMAYQgQAABhDiAAAAGMIEQAAYAwhAgAAjCFEAACAMYQIAAAwhhABAADGECIAAMAYQgQAABhDiAAAAGMIEQAAYAwhAgAAjCFEAACAMYQIAAAwhhABAADGECIAAMAYQgQAABhDiAAAAGNcC5F77rlHHo8nuy1YsECSlEgkVFlZqcLCQsViMWUyGbemBAAAOc61ENmzZ4/+9Kc/aWBgQAMDA9q7d68cx1Ftba0qKioUj8eVTCbV2trq1pQAACDHeTIuXKI4deqUwuGwjhw5opkzZ2aPd3R06I477lBfX58CgYC6u7u1adMmvfXWW2Me27ZthUIhWZalYDB4oUsFAADjYKyf365cEfn7
3/+uTCajZcuWacaMGbrxxhv10Ucfqbu7W1VVVQoEApKk8vJyJZPJs47lOI5s2x6xAQCAycmVEOnp6VFZWZl+97vfKZlMavr06dq4caNs21ZJSUn2dR6PR16vVwMDA6OO1dTUpFAolN2i0agbSwQAABOQK1/N/KcPP/xQ8+bN0+bNm+XxeNTS0pI9F41G1dXVpUgkcsb3Oo4jx3Gy+7ZtKxqN8tUMAAA5ZKxfzfguxuSzZs3S8PCw5syZo0QiMeJcOp1WXl7eqO/1+/3y+/0XY1kAAGCCceWrmfvvv187d+7M7v/tb3/TtGnTtHTpUnV1dWWPp1IpOY6jcDjsxrQAACDHuXJFZNmyZXrooYc0Z84cnTp1Svfcc49uv/12XX/99bIsS21tbVq/fr2am5tVU1Mjr9frxrQAACDHuRIi69evV09Pj26++WYVFBRo9erVamxslM/n0/bt21VfX69YLKahoSF1dna6MSUAAJgELsrNqv+pv79f8Xhc1dXVKioqOq/38hwRAAByj9GbVf9TJBIZ9a9kAADA1MWP3gEAAGMIEQAAYAwhAgAAjCFEAACAMYQIAAAwhhABAADGECIAAMAYQgQAABhDiAAAAGMIEQAAYAwhAgAAjCFEAACAMYQIAAAwhhABAADGECIAAMAYQgQAABhDiAAAAGMIEQAAYAwhAgAAjCFEAACAMYQIAAAwhhABAADGECIAAMAYQgQAABhDiAAAAGMIEQAAYAwhAgAAjCFEAACAMYQIAAAwhhABAADGXJQQufHGG9Xa2ipJSiQSqqysVGFhoWKxmDKZzMWYEgAA5CDXQ+S5557Tq6++KklyHEe1tbWqqKhQPB5XMpnMBgoAAICrIfKvf/1LDzzwgL761a9Kknbt2iXLstTS0qL58+ersbFRO3bscHNKAACQw3xuDvbAAw9o9erVOnnypCSpu7tbVVVVCgQCkqTy8nIlk8mzjuE4jhzHye7btu3mEgEAwATi2hWRN998U6+//roefvjh7DHbtlVSUpLd93g88nq9GhgYGHWcpqYmhUKh7BaNRt1aIgAAmGBcCZHPPvtMGzdu1BNPPKFgMJg97vP55Pf7R7w2Pz9fg4ODo47V0NAgy7KyW29vrxtLBAAAE5ArX8387Gc/U2VlpW666aYRx8PhsBKJxIhj6XRaeXl5o47l9/u/FC8AAGByciVEnn/+eR09elSzZs2SJA0ODmrnzp0qLi7WF198kX1dKpWS4zgKh8NuTAsAAHKcKyGye/dunTp1Krv/4IMPqqqqSrfffrsWL16strY2rV+/Xs3NzaqpqZHX63VjWgAAkONcCZHLLrtsxP7MmTM1e/ZszZ49W9u3b1d9fb1isZiGhobU2dnpxpQAAGAS8GTG4VGn/f39isfjqq6uVlFR0Xm917ZthUIhWZY14kZYAAAwcY3189vV54iMJhKJKBKJjMdUAAAgh/CjdwAAwBhCBAAAGEOIAAAAYwgRAABgDCECAACMIUQAAIAxhAgAADCGEAEAAMYQIgAAwBhCBAAAGEOIAAAAYwgRAABgDCECAACMIUQAAIAxhAgAADCGEAEAAMYQIgAAwBhCBAAAGEOIAAAAYwgRAABgDCECAACMIUQAAIAxhAgAADCGEAEAAMYQIgAAwBhCBAAAGEOIAAAAYwgRAABgDCECAACM8bk94CeffKL9+/fryiuv1OzZs90eHoBhg4OD2rdvnytjnTx5UqlUSsXFxZoxY8YFj1daWqpAIODCygCMF1dD5IUXXtDdd9+t4uJi7d+/X0899ZTWrVunRCKhDRs26P3339edd96pRx55RB6Px82pAYyTffv2qaKiwvQyzmjPnj1avny56WUAOA+eTCaTcWOg48ePa+HChXrzzTe1ZMkSPfPMM9qyZYsOHDig0tJS3XDDDYrFYtq8ebPWrl2rDRs2jGlc27YVCoVkWZaCwaAbSwVwAdy8ItLT06PbbrtNzz77rBYtWnTB43FFBJg4xvr57doV
kXQ6rW3btmnJkiWSpK997WsaGBjQrl27ZFmWWlpaFAgE1NjYqE2bNo05RABMLIFAwPWrDosWLeJKBjBFuRYi0WhUt956qyTpiy++0NatW7VmzRp1d3erqqoq+6+U8vJyJZPJUcdxHEeO42T3bdt2a4kAAGCCcf2vZrq7u3XJJZfotdde07Zt22TbtkpKSrLnPR6PvF6vBgYGzvj+pqYmhUKh7BaNRt1eIgAAmCBcD5Hy8nK9/vrrKisr04YNG+Tz+eT3+0e8Jj8/X4ODg2d8f0NDgyzLym69vb1uLxEAAEwQroeIx+PRVVddpdbWVr300ksKh8M6evToiNek02nl5eWd8f1+v1/BYHDEBgAAJifXQuSNN95QLBbL7vt8/779pLS0VF1dXdnjqVRKjuMoHA67NTUAAMhRrt2sWlpaqrq6Oi1cuFDf/OY3tWXLFl1//fW66aabdNddd6mtrU3r169Xc3Ozampq5PV63ZoawBgdPHhQ6XTa9DKyenp6Rvx3oigoKNDChQtNLwOYElx7jogkvfrqq7rvvvvU19enG264QY8//riKiorU0dGh+vp6FRQUaGhoSJ2dnSorKxvTmDxHBHDHwYMHdeWVV5peRs44cOAAMQJcgHF/jogk3XDDDWf809y6ujodPHhQ8Xhc1dXVKioqcnNaAGNw+kqIWw8Pc4Pbj3h3w+mHrE2kK0fAZOb6b82MJhKJKBKJjNd0AEYx0R4edu2115peAgCD+PVdAABgDCECAACMIUQAAIAxhAgAADCGEAEAAMYQIgAAwBhCBAAAGEOIAAAAY8btgWYAzPKc+kxXzZmmGccPSEf4N8hoZhw/oKvmTJPn1GemlwJMCYQIMEXkn/hI726cKf3vRul/Ta9m4lok6d2NM9Vz4iNJ1aaXA0x6hAgwRXw283It/58Teu6557SotNT0ciasnn37dOutt2rHty43vRRgSiBEgCki48vX3n8O6+SsK6W5y0wvZ8I6+c9h7f3nsDK+fNNLAaYEvigGAADGECIAAMAYQgQAABhDiAAAAGMIEQAAYAwhAgAAjCFEAACAMYQIAAAwhhABAADG8GRVYIoYHByUJL377ruGV/J/Tp48qVQqpeLiYs2YMcP0ciRJPT09ppcATCmECDBF7Nu3T5J01113GV5JbigoKDC9BGBKIESAKaKurk6SVFpaqkAgYHYx/19PT49uu+02Pfvss1q0aJHp5WQVFBRo4cKFppcBTAmECDBFzJ49W3feeafpZZzRokWLtHz5ctPLAGAAN6sCAABjCBEAAGAMIQIAAIwhRAAAgDGECAAAMMa1EHnppZc0b948+Xw+XXPNNdmHAiUSCVVWVqqwsFCxWEyZTMatKQEAQI5zJUQ++OADbdiwQc3Nzerv79cVV1yhO++8U47jqLa2VhUVFYrH40omk2ptbXVjSgAAMAm4EiI9PT1qbGzUt7/9bV1yySW6++67FY/HtWvXLlmWpZaWFs2fP1+NjY3asWOHG1MCAIBJwJUHmq1atWrE/v79+7VgwQJ1d3erqqoq+xTH8vJyJZPJs47lOI4cx8nu27btxhIBuGRwcDD7uPgLdforXLd+32UiPTUWwNi4/mTVzz//XFu3btV9992nQ4cOqaSkJHvO4/HI6/VqYGBAhYWFZ3x/U1OTfvrTn7q9LAAu2bdvnyoqKlwd87bbbnNlnD179vCEViDHuB4iW7Zs0cyZM/Xd735XW7Zskd/vH3E+Pz9fg4ODo4ZIQ0OD7r///uy+bduKRqNuLxPAf6m0tFR79uxxZSy3f323tLTUhVUBGE+uhsif//xnPfnkk+rq6tL06dMVDoeVSCRGvCadTisvL2/UMfx+/5fiBcDEEQgEXL3qcO2117o2FoDc49qf7x46dEi33nqrnnjiCS1evFiSVFlZqa6uruxrUqmUHMdROBx2a1oAAJDDXAmRkydPatWqVaqrq9PNN9+sEydO6MSJE1qxYoUsy1JbW5skqbm5WTU1NfJ6
vW5MCwAAcpwn48ITxjo6OrR69eovHT98+LDee+891dfXq6CgQENDQ+rs7FRZWdmYx7ZtW6FQSJZlKRgMXuhSAQDAOBjr57crIXIu/f39isfjqq6uVlFR0Xm9lxABACD3jPXz2/W/mjmTSCSiSCQyHlMBAIAcwo/eAQAAYwgRAABgDCECAACMIUQAAIAxhAgAADCGEAEAAMYQIgAAwBhCBAAAGDMuDzS7EKcf/GrbtuGVAACAsTr9uX2uB7hP+BBJp9OSpGg0anglAADgfKXTaYVCoVHPj8tvzVyI4eFhHTlyRAUFBfJ4PKaXA8BFtm0rGo2qt7eX35ICJplMJqN0Oq25c+dq2rTR7wSZ8CECYPLiRy0BcLMqAAAwhhABAADGECIAjPH7/frJT34iv99veikADOEeEQAAYAxXRAAAgDGECAAAMIYQAQAAxhAiAIz45JNPVFJSolQqZXopAAwiRACMu2PHjmnVqlVECABCBMD4W7dundatW2d6GQAmAP58F8C4O3TokObNmyePx6PDhw+ruLjY9JIAGMIVEQDjbt68eaaXAGCCIEQAAIAxhAgAADCGEAEAAMYQIgAAwBhCBAAAGOMzvQAAUxdPDwDAFREAAGAMIQIAAIwhRAAAgDGECAAAMIYQAQAAxhAiAADAGEIEAAAYQ4gAAABjCBEAAGAMIQIAAIwhRAAAgDH/D4x+7sL25phDAAAAAElFTkSuQmCC",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "plt.rcParams['font.sans-serif']=['SimHei'] #设置中文字体\n",
    "df=[20,21,22,23,24,25,26,27,28,29,30,31,100] #数据示例\n",
    "plt.boxplot(df) #绘制箱线图\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0cc0e648",
   "metadata": {},
   "source": [
    "## 3.3数据变换\n",
    "### 3.3.1数据类型变换"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "539ca385",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:15:23.420372Z",
     "start_time": "2024-06-05T02:15:23.415447Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "a的数据类型: <class 'int'>\n",
      "b的数据类型: <class 'str'>\n"
     ]
    }
   ],
   "source": [
    "a=12\n",
    "print(\"a的数据类型:\",type(a))\n",
    "b=str(a)\n",
    "print(\"b的数据类型:\",type(b))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "192cf134",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "a的数据类型: <class 'str'>\n",
      "b的数据类型: <class 'int'>\n"
     ]
    }
   ],
   "source": [
    "a=\"12\"\n",
    "print(\"a的数据类型:\",type(a))\n",
    "b=int(a)\n",
    "print(\"b的数据类型:\",type(b))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "d5f9faae",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:15:23.888449Z",
     "start_time": "2024-06-05T02:15:23.884134Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "a的数据类型: <class 'str'>\n",
      "b的数据类型: <class 'float'>\n"
     ]
    }
   ],
   "source": [
    "a=\"42.5\"\n",
    "print(\"a的数据类型:\",type(a))\n",
    "b=float(a)\n",
    "print(\"b的数据类型:\",type(b))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "a6159689",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:15:24.413960Z",
     "start_time": "2024-06-05T02:15:24.410251Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "a的数据类型: <class 'str'>\n",
      "b的数据类型: <class 'float'>\n"
     ]
    }
   ],
   "source": [
    "a=\"42.5\"\n",
    "print(\"a的数据类型:\",type(a))\n",
    "b=float(a)\n",
    "print(\"b的数据类型:\",type(b))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "1d30334e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 3 entries, 0 to 2\n",
      "Data columns (total 2 columns):\n",
      " #   Column   Non-Null Count  Dtype \n",
      "---  ------   --------------  ----- \n",
      " 0   column1  3 non-null      object\n",
      " 1   column2  3 non-null      object\n",
      "dtypes: object(2)\n",
      "memory usage: 180.0+ bytes\n",
      "-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 3 entries, 0 to 2\n",
      "Data columns (total 2 columns):\n",
      " #   Column   Non-Null Count  Dtype  \n",
      "---  ------   --------------  -----  \n",
      " 0   column1  3 non-null      int32  \n",
      " 1   column2  3 non-null      float64\n",
      "dtypes: float64(1), int32(1)\n",
      "memory usage: 168.0 bytes\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df=pd.DataFrame({'column1': [1, '2', 3],\n",
    "   'column2':  [1.5, '3', 2.5]}) #创建一个包含数值和字符串的DataFrame\n",
    "df.info() #显示DataFrame列信息\n",
    "print('-='*20) #分隔符\n",
    "df['column1']=df['column1'].astype(int) #将数据类型转换为整型\n",
    "df['column2']=df['column2'].astype(float) #将数据类型转换为浮点型\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "f9361ffd",
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert_b_to_m(value): # 定义一个函数来转换单位到万\n",
    "    if '亿' in value:\n",
    "        return float(value.replace('亿', '')) * 10000\n",
    "    elif '万' in value:\n",
    "        return float(value.replace('万', ''))\n",
    "    else:\n",
    "        return float(value)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "61936e28",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:15:26.500478Z",
     "start_time": "2024-06-05T02:15:26.487786Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>股票代码</th>\n",
       "      <th>股票简称</th>\n",
       "      <th>省份</th>\n",
       "      <th>城市</th>\n",
       "      <th>主营业务收入</th>\n",
       "      <th>净利润</th>\n",
       "      <th>员工人数</th>\n",
       "      <th>上市日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>平安银行</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>12763400.0</td>\n",
       "      <td>3963500.00</td>\n",
       "      <td>44077</td>\n",
       "      <td>1991-04-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>万科A</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>29030800.0</td>\n",
       "      <td>2102700.00</td>\n",
       "      <td>131817</td>\n",
       "      <td>1991-01-29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5</td>\n",
       "      <td>ST星源</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>12700.0</td>\n",
       "      <td>-1149.40</td>\n",
       "      <td>488</td>\n",
       "      <td>1990-12-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>6</td>\n",
       "      <td>深振业A</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>82900.0</td>\n",
       "      <td>-420.71</td>\n",
       "      <td>425</td>\n",
       "      <td>1992-04-27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>7</td>\n",
       "      <td>*ST全新</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>15300.0</td>\n",
       "      <td>801.25</td>\n",
       "      <td>76</td>\n",
       "      <td>1992-04-13</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   股票代码   股票简称  省份   城市      主营业务收入         净利润    员工人数        上市日期\n",
       "0     1   平安银行  广东  深圳市  12763400.0  3963500.00   44077  1991-04-03\n",
       "1     2    万科A  广东  深圳市  29030800.0  2102700.00  131817  1991-01-29\n",
       "2     5   ST星源  广东  深圳市     12700.0    -1149.40     488  1990-12-10\n",
       "3     6   深振业A  广东  深圳市     82900.0     -420.71     425  1992-04-27\n",
       "4     7  *ST全新  广东  深圳市     15300.0      801.25      76  1992-04-13"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 应用转换函数到数据框的列\n",
    "data['主营业务收入']=data['主营业务收入'].apply(convert_b_to_m)\n",
    "data['净利润']=data['净利润'].apply(convert_b_to_m) \n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "2f9afce9",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:15:27.074926Z",
     "start_time": "2024-06-05T02:15:27.065979Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Index: 1902 entries, 0 to 1904\n",
      "Data columns (total 8 columns):\n",
      " #   Column  Non-Null Count  Dtype  \n",
      "---  ------  --------------  -----  \n",
      " 0   股票代码    1902 non-null   int64  \n",
      " 1   股票简称    1902 non-null   object \n",
      " 2   省份      1902 non-null   object \n",
      " 3   城市      1902 non-null   object \n",
      " 4   主营业务收入  1902 non-null   float64\n",
      " 5   净利润     1902 non-null   float64\n",
      " 6   员工人数    1902 non-null   int64  \n",
      " 7   上市日期    1902 non-null   object \n",
      "dtypes: float64(2), int64(2), object(4)\n",
      "memory usage: 133.7+ KB\n"
     ]
    }
   ],
   "source": [
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "217ffef7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>股票代码</th>\n",
       "      <th>股票简称</th>\n",
       "      <th>省份</th>\n",
       "      <th>城市</th>\n",
       "      <th>主营业务收入</th>\n",
       "      <th>净利润</th>\n",
       "      <th>员工人数</th>\n",
       "      <th>上市日期</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>000001</td>\n",
       "      <td>平安银行</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>12763400.0</td>\n",
       "      <td>3963500.00</td>\n",
       "      <td>44077</td>\n",
       "      <td>1991-04-03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>000002</td>\n",
       "      <td>万科A</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>29030800.0</td>\n",
       "      <td>2102700.00</td>\n",
       "      <td>131817</td>\n",
       "      <td>1991-01-29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>000005</td>\n",
       "      <td>ST星源</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>12700.0</td>\n",
       "      <td>-1149.40</td>\n",
       "      <td>488</td>\n",
       "      <td>1990-12-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>000006</td>\n",
       "      <td>深振业A</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>82900.0</td>\n",
       "      <td>-420.71</td>\n",
       "      <td>425</td>\n",
       "      <td>1992-04-27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>000007</td>\n",
       "      <td>*ST全新</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>15300.0</td>\n",
       "      <td>801.25</td>\n",
       "      <td>76</td>\n",
       "      <td>1992-04-13</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     股票代码   股票简称  省份   城市      主营业务收入         净利润    员工人数        上市日期\n",
       "0  000001   平安银行  广东  深圳市  12763400.0  3963500.00   44077  1991-04-03\n",
       "1  000002    万科A  广东  深圳市  29030800.0  2102700.00  131817  1991-01-29\n",
       "2  000005   ST星源  广东  深圳市     12700.0    -1149.40     488  1990-12-10\n",
       "3  000006   深振业A  广东  深圳市     82900.0     -420.71     425  1992-04-27\n",
       "4  000007  *ST全新  广东  深圳市     15300.0      801.25      76  1992-04-13"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def fill_zero(x):\n",
    "    x=str(x) #将x转换为字符串\n",
    "    xx=x.zfill(6) #用0补全6位\n",
    "    return(xx)\n",
    "data['股票代码']=data['股票代码'].apply(fill_zero) #批量应用补全函数\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3430f565",
   "metadata": {},
   "source": [
    "### 3.3.2数值型特征归一化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "2fc7e3c2",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:15:40.191715Z",
     "start_time": "2024-06-05T02:15:28.918858Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 5.63702839, 19.90055426,  2.19140122],\n",
       "       [13.2365609 , 10.45569722,  7.1434334 ],\n",
       "       [-0.31963089, -0.22280493, -0.26875528],\n",
       "       ...,\n",
       "       [-0.28309871, -0.23167375, -0.2441475 ],\n",
       "       [ 0.02466848,  0.21801631,  0.23542188],\n",
       "       [-0.27319485, -0.22740412, -0.24939641]])"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X=data[['主营业务收入','净利润','员工人数']] \n",
    "from sklearn.preprocessing import StandardScaler\n",
    "X_scaler=StandardScaler()  #创建一个StandardScaler对象\n",
    "X_scaled=X_scaler.fit_transform(X)  #使用fit_transform()方法对输入数据X进行Z-score标准化\n",
    "X_scaled #结果输出"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "6eb6a763",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:15:40.205250Z",
     "start_time": "2024-06-05T02:15:40.194708Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[3.02244938e-01, 1.00000000e+00, 7.72697529e-02],\n",
       "       [6.87481911e-01, 6.04245092e-01, 2.31191746e-01],\n",
       "       [2.88813348e-04, 1.56798443e-01, 8.01713597e-04],\n",
       "       ...,\n",
       "       [2.14070924e-03, 1.56426825e-01, 1.56658696e-03],\n",
       "       [1.77421033e-02, 1.75269572e-01, 1.64728461e-02],\n",
       "       [2.64275775e-03, 1.56605730e-01, 1.40343737e-03]])"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X=data[['主营业务收入','净利润','员工人数']] \n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "X_minmax_scaler=MinMaxScaler() #创建一个MinMaxScaler对象\n",
    "X_minmax_scaled=X_minmax_scaler.fit_transform(X) #使用fit_transform()方法对输入数据X进行Min-max标准化\n",
    "X_minmax_scaled"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "20ebd3fd",
   "metadata": {},
   "source": [
    "### 3.3.3类别型特征编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "87e05717",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:15:40.405123Z",
     "start_time": "2024-06-05T02:15:40.207253Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 0, 1, 0, 1])"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import LabelEncoder\n",
    "df=pd.DataFrame({'性别':['男','女','男','女','男'],\n",
    "              '年龄': [10,22,23,24,15],\n",
    "              '身高':[160,150,180,165,170]})#  创建一个示例数据集\n",
    "label_encoder=LabelEncoder()#  创建LabelEncoder对象\n",
    "label_encoder.fit_transform(df['性别'])#将分类变量进行性别编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "492c6f77",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:15:40.608992Z",
     "start_time": "2024-06-05T02:15:40.407119Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>年龄</th>\n",
       "      <th>身高</th>\n",
       "      <th>性别_女</th>\n",
       "      <th>性别_男</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10</td>\n",
       "      <td>160</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>22</td>\n",
       "      <td>150</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>23</td>\n",
       "      <td>180</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>24</td>\n",
       "      <td>165</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>15</td>\n",
       "      <td>170</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   年龄   身高   性别_女   性别_男\n",
       "0  10  160  False   True\n",
       "1  22  150   True  False\n",
       "2  23  180  False   True\n",
       "3  24  165   True  False\n",
       "4  15  170  False   True"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df=pd.DataFrame({'性别':['男','女','男','女','男'],\n",
    "              '年龄': [10,22,23,24,15],\n",
    "              '身高':[160,150,180,165,170]})# \n",
    "pd.get_dummies(df,columns=['性别'])  #将分类变量转换为哑变量"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "30c52a4e",
   "metadata": {},
   "source": [
    "### 3.3.4日期变量衍生"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "7e7c46c6",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:15:40.947558Z",
     "start_time": "2024-06-05T02:15:40.610986Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2024 2 10\n"
     ]
    }
   ],
   "source": [
    "from datetime import datetime\n",
    "str_date=pd.to_datetime('2024-02-10')\n",
    "year=str_date.year\n",
    "month=str_date.month\n",
    "day=str_date.day\n",
    "print(year,month,day)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "300d144f",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:15:41.844248Z",
     "start_time": "2024-06-05T02:15:40.949555Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>股票代码</th>\n",
       "      <th>股票简称</th>\n",
       "      <th>省份</th>\n",
       "      <th>城市</th>\n",
       "      <th>主营业务收入</th>\n",
       "      <th>净利润</th>\n",
       "      <th>员工人数</th>\n",
       "      <th>上市日期</th>\n",
       "      <th>年</th>\n",
       "      <th>月</th>\n",
       "      <th>日</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>000001</td>\n",
       "      <td>平安银行</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>12763400.0</td>\n",
       "      <td>3963500.00</td>\n",
       "      <td>44077</td>\n",
       "      <td>1991-04-03</td>\n",
       "      <td>1991</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>000002</td>\n",
       "      <td>万科A</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>29030800.0</td>\n",
       "      <td>2102700.00</td>\n",
       "      <td>131817</td>\n",
       "      <td>1991-01-29</td>\n",
       "      <td>1991</td>\n",
       "      <td>1</td>\n",
       "      <td>29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>000005</td>\n",
       "      <td>ST星源</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>12700.0</td>\n",
       "      <td>-1149.40</td>\n",
       "      <td>488</td>\n",
       "      <td>1990-12-10</td>\n",
       "      <td>1990</td>\n",
       "      <td>12</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>000006</td>\n",
       "      <td>深振业A</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>82900.0</td>\n",
       "      <td>-420.71</td>\n",
       "      <td>425</td>\n",
       "      <td>1992-04-27</td>\n",
       "      <td>1992</td>\n",
       "      <td>4</td>\n",
       "      <td>27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>000007</td>\n",
       "      <td>*ST全新</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>15300.0</td>\n",
       "      <td>801.25</td>\n",
       "      <td>76</td>\n",
       "      <td>1992-04-13</td>\n",
       "      <td>1992</td>\n",
       "      <td>4</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     股票代码   股票简称  省份   城市      主营业务收入         净利润    员工人数        上市日期     年  \\\n",
       "0  000001   平安银行  广东  深圳市  12763400.0  3963500.00   44077  1991-04-03  1991   \n",
       "1  000002    万科A  广东  深圳市  29030800.0  2102700.00  131817  1991-01-29  1991   \n",
       "2  000005   ST星源  广东  深圳市     12700.0    -1149.40     488  1990-12-10  1990   \n",
       "3  000006   深振业A  广东  深圳市     82900.0     -420.71     425  1992-04-27  1992   \n",
       "4  000007  *ST全新  广东  深圳市     15300.0      801.25      76  1992-04-13  1992   \n",
       "\n",
       "    月   日  \n",
       "0   4   3  \n",
       "1   1  29  \n",
       "2  12  10  \n",
       "3   4  27  \n",
       "4   4  13  "
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "date_series=data['上市日期'].apply(pd.to_datetime)\n",
    "#提取年、月、日\n",
    "year=date_series.dt.year\n",
    "month=date_series.dt.month\n",
    "day=date_series.dt.day \n",
    "#赋值给DataFrame\n",
    "data['年']=year\n",
    "data['月']=month\n",
    "data['日']=day\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0ab928d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "74a3304e",
   "metadata": {},
   "source": [
    "## 本章实训——A股股银行行业历史行情数据获取及预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "d7c49092",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[     0       1     2          3\n",
       " 0   排名    股票代码  企业简称  营业收入 （亿元）\n",
       " 1    1  600028  中国石化   32122.15\n",
       " 2    2  601857  中国石油   30110.12\n",
       " 3    3  601668  中国建筑   22655.29\n",
       " 4    4  601390  中国中铁   12634.75\n",
       " 5    5  601186  中国铁建   11379.93\n",
       " 6    6  600941  中国移动   10093.09\n",
       " 7    7  601318  中国平安    9137.89\n",
       " 8    8  601398  工商银行    8430.70\n",
       " 9    9  601628  中国人寿    8378.59\n",
       " 10  10  601939  建设银行    7697.36,\n",
       "      0       1     2         3\n",
       " 0   排名    股票代码  企业简称  净利润 （亿元）\n",
       " 1    1  601398  工商银行   3651.16\n",
       " 2    2  601939  建设银行   3324.60\n",
       " 3    3  601288  农业银行   2698.20\n",
       " 4    4  601988  中国银行   2463.71\n",
       " 5    5  601857  中国石油   1802.91\n",
       " 6    6  600036  招商银行   1480.06\n",
       " 7    7  600941  中国移动   1319.35\n",
       " 8    8  600938  中国海油    1240.9\n",
       " 9    9  601318  中国平安   1092.74\n",
       " 10  10  601328  交通银行    932.52,\n",
       "      0       1     2          3\n",
       " 0   排名    股票代码  企业简称  利润总额 （亿元）\n",
       " 1    1  601398  工商银行    4219.66\n",
       " 2    2  601939  建设银行    3893.77\n",
       " 3    3  601288  农业银行    3074.19\n",
       " 4    4  601988  中国银行    2956.08\n",
       " 5    5  601857  中国石油    2374.58\n",
       " 6    6  600036  招商银行    1766.18\n",
       " 7    7  600938  中国海油    1729.74\n",
       " 8    8  600941  中国移动    1705.31\n",
       " 9    9  601318  中国平安    1201.17\n",
       " 10  10  600519  贵州茅台    1036.63,\n",
       "     序号    股票代码  股票简称               公司名称  省份   城市 主营业务收入(202409) 净利润(202409)  \\\n",
       " 0    1       1  平安银行         平安银行股份有限公司  广东  深圳市       1115.82亿     397.29亿   \n",
       " 1    2    1227  兰州银行         兰州银行股份有限公司  甘肃  兰州市         60.53亿      15.33亿   \n",
       " 2    3    2142  宁波银行         宁波银行股份有限公司  浙江  宁波市        507.53亿     207.71亿   \n",
       " 3    4    2807  江阴银行   江苏江阴农村商业银行股份有限公司  江苏  无锡市         30.18亿      11.26亿   \n",
       " 4    5    2839  张家港行  江苏张家港农村商业银行股份有限公司  江苏  苏州市         36.33亿      14.95亿   \n",
       " 5    6    2936  郑州银行         郑州银行股份有限公司  河南  郑州市         90.41亿      22.89亿   \n",
       " 6    7    2948  青岛银行         青岛银行股份有限公司  山东  青岛市        104.86亿      35.60亿   \n",
       " 7    8    2958  青农商行     青岛农村商业银行股份有限公司  山东  青岛市         84.42亿      32.03亿   \n",
       " 8    9    2966  苏州银行         苏州银行股份有限公司  江苏  苏州市         92.89亿      43.30亿   \n",
       " 9   10  600000  浦发银行     上海浦东发展银行股份有限公司  上海  上海市       1298.39亿     356.87亿   \n",
       " 10  11  600015  华夏银行         华夏银行股份有限公司  北京  北京市        711.35亿     189.02亿   \n",
       " 11  12  600016  民生银行       中国民生银行股份有限公司  北京  北京市       1016.60亿     307.16亿   \n",
       " 12  13  600036  招商银行         招商银行股份有限公司  广东  深圳市       2527.09亿    1140.39亿   \n",
       " 13  14  600908  无锡银行     无锡农村商业银行股份有限公司  江苏  无锡市         36.25亿      17.70亿   \n",
       " 14  15  600919  江苏银行         江苏银行股份有限公司  江苏  南京市        623.03亿     292.96亿   \n",
       " 15  16  600926  杭州银行         杭州银行股份有限公司  浙江  杭州市        284.94亿     138.70亿   \n",
       " 16  17  600928  西安银行         西安银行股份有限公司  陕西  西安市         55.29亿      19.28亿   \n",
       " 17  18  601009  南京银行         南京银行股份有限公司  江苏  南京市        385.58亿     167.72亿   \n",
       " 18  19  601077  渝农商行     重庆农村商业银行股份有限公司  重庆  重庆市        215.14亿     105.74亿   \n",
       " 19  20  601128  常熟银行   江苏常熟农村商业银行股份有限公司  江苏  苏州市         83.70亿      32.10亿   \n",
       " \n",
       "       员工人数        上市日期  招股书  公司财报    行业分类  \\\n",
       " 0    40830  1991-04-03   --   NaN  股份制银行Ⅲ   \n",
       " 1     4279  2022-01-17  NaN   NaN    城商行Ⅲ   \n",
       " 2    27540  2007-07-19   --   NaN    城商行Ⅲ   \n",
       " 3     1752  2016-09-02  NaN   NaN    农商行Ⅲ   \n",
       " 4     2425  2017-01-24  NaN   NaN    农商行Ⅲ   \n",
       " 5     5822  2018-09-19  NaN   NaN    城商行Ⅲ   \n",
       " 6     5137  2019-01-16  NaN   NaN    城商行Ⅲ   \n",
       " 7     5710  2019-03-26  NaN   NaN    农商行Ⅲ   \n",
       " 8     5261  2019-08-02  NaN   NaN    城商行Ⅲ   \n",
       " 9    61892  1999-11-10   --   NaN  股份制银行Ⅲ   \n",
       " 10   38900  2003-09-12   --   NaN  股份制银行Ⅲ   \n",
       " 11   63071  2000-12-19   --   NaN  股份制银行Ⅲ   \n",
       " 12  115407  2002-04-09   --   NaN  股份制银行Ⅲ   \n",
       " 13    1748  2016-09-23  NaN   NaN    农商行Ⅲ   \n",
       " 14   19989  2016-08-02  NaN   NaN    城商行Ⅲ   \n",
       " 15   13877  2016-10-27  NaN   NaN    城商行Ⅲ   \n",
       " 16    3266  2019-03-01  NaN   NaN    城商行Ⅲ   \n",
       " 17   16260  2007-07-19   --   NaN    城商行Ⅲ   \n",
       " 18   14592  2019-10-29  NaN   NaN    农商行Ⅲ   \n",
       " 19    7376  2016-09-30  NaN   NaN    农商行Ⅲ   \n",
       " \n",
       "                                                  产品类型  \\\n",
       " 0                                              商业银行业务   \n",
       " 1                                  公司银行业务、个人银行业务、资金业务   \n",
       " 2                                   客户贷款及垫款、客户存款、同业拆入   \n",
       " 3             公司贷款、公司存款、中间业务及服务、国际业务、个人银行业务产品和服务、资金业务   \n",
       " 4              存放同业、存放中央银行、拆出资金、发放贷款及垫款、买入返售金融资产、金融投资   \n",
       " 5                                  公司银行业务、零售银行业务、资金业务   \n",
       " 6                                公司银行业务、零售银行业务、金融市场业务   \n",
       " 7                          公司贷款、公司存款、中间业务、国际贸易融资及结算服务   \n",
       " 8                                 公司业务、个人业务、资金业务、其他业务   \n",
       " 9                     公司金融业务、零售银行业务、金融市场与金融机构业务、渠道与服务   \n",
       " 10                                               金融服务   \n",
       " 11       银行业务、资金业务、融资租赁业务、基金及资产管理业务、投资银行业务及提供其他相关金融服务   \n",
       " 12                                      零售银行服务、批发银行服务   \n",
       " 13                                     公司业务、个人业务、资金业务   \n",
       " 14  发放贷款及垫款、债务工具投资、存放同业及其他金融机构款项、存放中央银行款项、长期应收款、买入...   \n",
       " 15                             大公司板块、大零售板块、大资管板块、渠道建设   \n",
       " 16                                 公司金融业务、个人金融业务、资金业务   \n",
       " 17                              商业银行业务、存款业务、贷款业务、中间业务   \n",
       " 18                               普惠金融业务、公司金融业务、金融市场业务   \n",
       " 19                        零售银行业务、公司银行业务、金融市场业务、村镇银行业务   \n",
       " \n",
       "                                                  主营业务  \n",
       " 0                                 经有关监管机构批准的各项商业银行业务。  \n",
       " 1   公司银行业务、零售银行业务、普惠金融业务、金融市场业务、理财业务、网络金融业务、金融科技、兰...  \n",
       " 2                对公及对私存款、贷款、支付结算、资金业务、并提供资产管理及其他金融业务。  \n",
       " 3          吸收公众存款;发放短期,中期和长期贷款;办理国内结算;办理票据承兑与贴现等金融业务。  \n",
       " 4                                          存贷款等金融类业务。  \n",
       " 5                                 公司银行业务、零售银行业务和资金业务。  \n",
       " 6                         向客户提供公司及个人存款、贷款、支付结算等服务和产品。  \n",
       " 7                            公司银行业务、零售银行业务、资金业务及国际业务。  \n",
       " 8                                公司业务、个人业务、资金业务及其他业务。  \n",
       " 9                                         提供银行及相关金融服务  \n",
       " 10       提供存款、贷款、投资银行、贸易金融、绿色金融和现金管理等专业化、特色化和综合化金融服务。  \n",
       " 11         从事公司及个人银行业务、资金业务、融资租赁业务、资产管理业务及提供其他相关金融服务。  \n",
       " 12                  向客户提供各种批发及零售银行产品和服务，亦自营及代客进行资金业务。  \n",
       " 13                       公司存款业务、公司贷款业务、公司结算业务、其他公司业务。  \n",
       " 14        公司和个人金融业务、支付结算业务、资金业务、投资银行业务、融资租赁业务及其他金融业务。  \n",
       " 15                     为城乡中、小企业客户及居民家庭提供全面的商业银行产品及服务。  \n",
       " 16                           公司及个人银行服务，资金业务及其他商业银行业务。  \n",
       " 17                          公司银行业务、个人银行业务、资金业务和其他业务等。  \n",
       " 18                              普惠金融业务、公司金融业务、金融市场业务。  \n",
       " 19                      零售银行业务、公司银行业务、金融市场业务以及村镇银行业务。  ]"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import akshare as ak\n",
    "html='https://s.askci.com/stock/a/Z273511320550425164-0?#QueryCondition' #银行行业\n",
    "tabs=pd.read_html(html)\n",
    "tabs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "b45053bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "dat=[]\n",
    "for i in range(1,6): \n",
    "    d= pd.read_html('https://s.askci.com/stock/a/Z273511320550425164-0?reportTime=2024-09-30&pageNum='+str(i)+'#QueryCondition')[3]\n",
    "    dat.append(d)\n",
    "data=pd.concat(dat,axis=0)\n",
    "data.to_excel('A股银行行业股票基本信息.xlsx',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "8f5159b7",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:15:44.924480Z",
     "start_time": "2024-06-05T02:15:44.878735Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(42, 15)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>序号</th>\n",
       "      <th>股票代码</th>\n",
       "      <th>股票简称</th>\n",
       "      <th>公司名称</th>\n",
       "      <th>省份</th>\n",
       "      <th>城市</th>\n",
       "      <th>主营业务收入(202409)</th>\n",
       "      <th>净利润(202409)</th>\n",
       "      <th>员工人数</th>\n",
       "      <th>上市日期</th>\n",
       "      <th>招股书</th>\n",
       "      <th>公司财报</th>\n",
       "      <th>行业分类</th>\n",
       "      <th>产品类型</th>\n",
       "      <th>主营业务</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>平安银行</td>\n",
       "      <td>平安银行股份有限公司</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>1115.82亿</td>\n",
       "      <td>397.29亿</td>\n",
       "      <td>40830</td>\n",
       "      <td>1991-04-03</td>\n",
       "      <td>--</td>\n",
       "      <td>NaN</td>\n",
       "      <td>股份制银行Ⅲ</td>\n",
       "      <td>商业银行业务</td>\n",
       "      <td>经有关监管机构批准的各项商业银行业务。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1227</td>\n",
       "      <td>兰州银行</td>\n",
       "      <td>兰州银行股份有限公司</td>\n",
       "      <td>甘肃</td>\n",
       "      <td>兰州市</td>\n",
       "      <td>60.53亿</td>\n",
       "      <td>15.33亿</td>\n",
       "      <td>4279</td>\n",
       "      <td>2022-01-17</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>城商行Ⅲ</td>\n",
       "      <td>公司银行业务、个人银行业务、资金业务</td>\n",
       "      <td>公司银行业务、零售银行业务、普惠金融业务、金融市场业务、理财业务、网络金融业务、金融科技、兰...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>2142</td>\n",
       "      <td>宁波银行</td>\n",
       "      <td>宁波银行股份有限公司</td>\n",
       "      <td>浙江</td>\n",
       "      <td>宁波市</td>\n",
       "      <td>507.53亿</td>\n",
       "      <td>207.71亿</td>\n",
       "      <td>27540</td>\n",
       "      <td>2007-07-19</td>\n",
       "      <td>--</td>\n",
       "      <td>NaN</td>\n",
       "      <td>城商行Ⅲ</td>\n",
       "      <td>客户贷款及垫款、客户存款、同业拆入</td>\n",
       "      <td>对公及对私存款、贷款、支付结算、资金业务、并提供资产管理及其他金融业务。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>2807</td>\n",
       "      <td>江阴银行</td>\n",
       "      <td>江苏江阴农村商业银行股份有限公司</td>\n",
       "      <td>江苏</td>\n",
       "      <td>无锡市</td>\n",
       "      <td>30.18亿</td>\n",
       "      <td>11.26亿</td>\n",
       "      <td>1752</td>\n",
       "      <td>2016-09-02</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>农商行Ⅲ</td>\n",
       "      <td>公司贷款、公司存款、中间业务及服务、国际业务、个人银行业务产品和服务、资金业务</td>\n",
       "      <td>吸收公众存款;发放短期,中期和长期贷款;办理国内结算;办理票据承兑与贴现等金融业务。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>2839</td>\n",
       "      <td>张家港行</td>\n",
       "      <td>江苏张家港农村商业银行股份有限公司</td>\n",
       "      <td>江苏</td>\n",
       "      <td>苏州市</td>\n",
       "      <td>36.33亿</td>\n",
       "      <td>14.95亿</td>\n",
       "      <td>2425</td>\n",
       "      <td>2017-01-24</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>农商行Ⅲ</td>\n",
       "      <td>存放同业、存放中央银行、拆出资金、发放贷款及垫款、买入返售金融资产、金融投资</td>\n",
       "      <td>存贷款等金融类业务。</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   序号  股票代码  股票简称               公司名称  省份   城市 主营业务收入(202409) 净利润(202409)  \\\n",
       "0   1     1  平安银行         平安银行股份有限公司  广东  深圳市       1115.82亿     397.29亿   \n",
       "1   2  1227  兰州银行         兰州银行股份有限公司  甘肃  兰州市         60.53亿      15.33亿   \n",
       "2   3  2142  宁波银行         宁波银行股份有限公司  浙江  宁波市        507.53亿     207.71亿   \n",
       "3   4  2807  江阴银行   江苏江阴农村商业银行股份有限公司  江苏  无锡市         30.18亿      11.26亿   \n",
       "4   5  2839  张家港行  江苏张家港农村商业银行股份有限公司  江苏  苏州市         36.33亿      14.95亿   \n",
       "\n",
       "    员工人数        上市日期  招股书  公司财报    行业分类  \\\n",
       "0  40830  1991-04-03   --   NaN  股份制银行Ⅲ   \n",
       "1   4279  2022-01-17  NaN   NaN    城商行Ⅲ   \n",
       "2  27540  2007-07-19   --   NaN    城商行Ⅲ   \n",
       "3   1752  2016-09-02  NaN   NaN    农商行Ⅲ   \n",
       "4   2425  2017-01-24  NaN   NaN    农商行Ⅲ   \n",
       "\n",
       "                                      产品类型  \\\n",
       "0                                   商业银行业务   \n",
       "1                       公司银行业务、个人银行业务、资金业务   \n",
       "2                        客户贷款及垫款、客户存款、同业拆入   \n",
       "3  公司贷款、公司存款、中间业务及服务、国际业务、个人银行业务产品和服务、资金业务   \n",
       "4   存放同业、存放中央银行、拆出资金、发放贷款及垫款、买入返售金融资产、金融投资   \n",
       "\n",
       "                                                主营业务  \n",
       "0                                经有关监管机构批准的各项商业银行业务。  \n",
       "1  公司银行业务、零售银行业务、普惠金融业务、金融市场业务、理财业务、网络金融业务、金融科技、兰...  \n",
       "2               对公及对私存款、贷款、支付结算、资金业务、并提供资产管理及其他金融业务。  \n",
       "3         吸收公众存款;发放短期,中期和长期贷款;办理国内结算;办理票据承兑与贴现等金融业务。  \n",
       "4                                         存贷款等金融类业务。  "
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data=pd.read_excel('A股银行行业股票基本信息.xlsx')\n",
    "print(data.shape)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "ae2a472b",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:15:45.028436Z",
     "start_time": "2024-06-05T02:15:44.927510Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>序号</th>\n",
       "      <th>股票代码</th>\n",
       "      <th>股票简称</th>\n",
       "      <th>公司名称</th>\n",
       "      <th>省份</th>\n",
       "      <th>城市</th>\n",
       "      <th>主营业务收入(202409)</th>\n",
       "      <th>净利润(202409)</th>\n",
       "      <th>员工人数</th>\n",
       "      <th>上市日期</th>\n",
       "      <th>招股书</th>\n",
       "      <th>公司财报</th>\n",
       "      <th>行业分类</th>\n",
       "      <th>产品类型</th>\n",
       "      <th>主营业务</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>000001</td>\n",
       "      <td>平安银行</td>\n",
       "      <td>平安银行股份有限公司</td>\n",
       "      <td>广东</td>\n",
       "      <td>深圳市</td>\n",
       "      <td>1115.82亿</td>\n",
       "      <td>397.29亿</td>\n",
       "      <td>40830</td>\n",
       "      <td>1991-04-03</td>\n",
       "      <td>--</td>\n",
       "      <td>NaN</td>\n",
       "      <td>股份制银行Ⅲ</td>\n",
       "      <td>商业银行业务</td>\n",
       "      <td>经有关监管机构批准的各项商业银行业务。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>001227</td>\n",
       "      <td>兰州银行</td>\n",
       "      <td>兰州银行股份有限公司</td>\n",
       "      <td>甘肃</td>\n",
       "      <td>兰州市</td>\n",
       "      <td>60.53亿</td>\n",
       "      <td>15.33亿</td>\n",
       "      <td>4279</td>\n",
       "      <td>2022-01-17</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>城商行Ⅲ</td>\n",
       "      <td>公司银行业务、个人银行业务、资金业务</td>\n",
       "      <td>公司银行业务、零售银行业务、普惠金融业务、金融市场业务、理财业务、网络金融业务、金融科技、兰...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>002142</td>\n",
       "      <td>宁波银行</td>\n",
       "      <td>宁波银行股份有限公司</td>\n",
       "      <td>浙江</td>\n",
       "      <td>宁波市</td>\n",
       "      <td>507.53亿</td>\n",
       "      <td>207.71亿</td>\n",
       "      <td>27540</td>\n",
       "      <td>2007-07-19</td>\n",
       "      <td>--</td>\n",
       "      <td>NaN</td>\n",
       "      <td>城商行Ⅲ</td>\n",
       "      <td>客户贷款及垫款、客户存款、同业拆入</td>\n",
       "      <td>对公及对私存款、贷款、支付结算、资金业务、并提供资产管理及其他金融业务。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>002807</td>\n",
       "      <td>江阴银行</td>\n",
       "      <td>江苏江阴农村商业银行股份有限公司</td>\n",
       "      <td>江苏</td>\n",
       "      <td>无锡市</td>\n",
       "      <td>30.18亿</td>\n",
       "      <td>11.26亿</td>\n",
       "      <td>1752</td>\n",
       "      <td>2016-09-02</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>农商行Ⅲ</td>\n",
       "      <td>公司贷款、公司存款、中间业务及服务、国际业务、个人银行业务产品和服务、资金业务</td>\n",
       "      <td>吸收公众存款;发放短期,中期和长期贷款;办理国内结算;办理票据承兑与贴现等金融业务。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>002839</td>\n",
       "      <td>张家港行</td>\n",
       "      <td>江苏张家港农村商业银行股份有限公司</td>\n",
       "      <td>江苏</td>\n",
       "      <td>苏州市</td>\n",
       "      <td>36.33亿</td>\n",
       "      <td>14.95亿</td>\n",
       "      <td>2425</td>\n",
       "      <td>2017-01-24</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>农商行Ⅲ</td>\n",
       "      <td>存放同业、存放中央银行、拆出资金、发放贷款及垫款、买入返售金融资产、金融投资</td>\n",
       "      <td>存贷款等金融类业务。</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   序号    股票代码  股票简称               公司名称  省份   城市 主营业务收入(202409) 净利润(202409)  \\\n",
       "0   1  000001  平安银行         平安银行股份有限公司  广东  深圳市       1115.82亿     397.29亿   \n",
       "1   2  001227  兰州银行         兰州银行股份有限公司  甘肃  兰州市         60.53亿      15.33亿   \n",
       "2   3  002142  宁波银行         宁波银行股份有限公司  浙江  宁波市        507.53亿     207.71亿   \n",
       "3   4  002807  江阴银行   江苏江阴农村商业银行股份有限公司  江苏  无锡市         30.18亿      11.26亿   \n",
       "4   5  002839  张家港行  江苏张家港农村商业银行股份有限公司  江苏  苏州市         36.33亿      14.95亿   \n",
       "\n",
       "    员工人数        上市日期  招股书  公司财报    行业分类  \\\n",
       "0  40830  1991-04-03   --   NaN  股份制银行Ⅲ   \n",
       "1   4279  2022-01-17  NaN   NaN    城商行Ⅲ   \n",
       "2  27540  2007-07-19   --   NaN    城商行Ⅲ   \n",
       "3   1752  2016-09-02  NaN   NaN    农商行Ⅲ   \n",
       "4   2425  2017-01-24  NaN   NaN    农商行Ⅲ   \n",
       "\n",
       "                                      产品类型  \\\n",
       "0                                   商业银行业务   \n",
       "1                       公司银行业务、个人银行业务、资金业务   \n",
       "2                        客户贷款及垫款、客户存款、同业拆入   \n",
       "3  公司贷款、公司存款、中间业务及服务、国际业务、个人银行业务产品和服务、资金业务   \n",
       "4   存放同业、存放中央银行、拆出资金、发放贷款及垫款、买入返售金融资产、金融投资   \n",
       "\n",
       "                                                主营业务  \n",
       "0                                经有关监管机构批准的各项商业银行业务。  \n",
       "1  公司银行业务、零售银行业务、普惠金融业务、金融市场业务、理财业务、网络金融业务、金融科技、兰...  \n",
       "2               对公及对私存款、贷款、支付结算、资金业务、并提供资产管理及其他金融业务。  \n",
       "3         吸收公众存款;发放短期,中期和长期贷款;办理国内结算;办理票据承兑与贴现等金融业务。  \n",
       "4                                         存贷款等金融类业务。  "
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def fill_zero(x, width=6):\n",
    "    \"\"\"Return ``x`` as a string left-padded with zeros to ``width`` characters.\n",
    "\n",
    "    Stock codes stored as numbers lose their leading zeros (``1`` instead of\n",
    "    ``000001``); this restores the fixed-width form.\n",
    "\n",
    "    Args:\n",
    "        x: Stock code as an int, float or str.\n",
    "        width: Target length of the code; defaults to 6.\n",
    "\n",
    "    Returns:\n",
    "        str: The zero-padded code. Values that are already ``width`` characters\n",
    "        or longer are returned without padding.\n",
    "    \"\"\"\n",
    "    # pandas reads a numeric column with missing cells as float, so 1 arrives\n",
    "    # as 1.0; drop the spurious fraction before converting to text.\n",
    "    if isinstance(x, float) and x.is_integer():\n",
    "        x = int(x)\n",
    "    # Surrounding whitespace would otherwise count towards the width.\n",
    "    return str(x).strip().zfill(width)\n",
    "# Normalise every stock code in the table with fill_zero, then preview the result.\n",
    "data['股票代码'] = data['股票代码'].map(fill_zero)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "8de5675d",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:16:29.790990Z",
     "start_time": "2024-06-05T02:15:45.029471Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(10158, 13)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>日期</th>\n",
       "      <th>股票代码</th>\n",
       "      <th>开盘</th>\n",
       "      <th>收盘</th>\n",
       "      <th>最高</th>\n",
       "      <th>最低</th>\n",
       "      <th>成交量</th>\n",
       "      <th>成交额</th>\n",
       "      <th>振幅</th>\n",
       "      <th>涨跌幅</th>\n",
       "      <th>涨跌额</th>\n",
       "      <th>换手率</th>\n",
       "      <th>股票简称</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2023-01-03</td>\n",
       "      <td>000001</td>\n",
       "      <td>13.20</td>\n",
       "      <td>13.77</td>\n",
       "      <td>13.85</td>\n",
       "      <td>13.05</td>\n",
       "      <td>2194128</td>\n",
       "      <td>2.971547e+09</td>\n",
       "      <td>6.08</td>\n",
       "      <td>4.64</td>\n",
       "      <td>0.61</td>\n",
       "      <td>1.13</td>\n",
       "      <td>平安银行</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2023-01-04</td>\n",
       "      <td>000001</td>\n",
       "      <td>13.71</td>\n",
       "      <td>14.32</td>\n",
       "      <td>14.42</td>\n",
       "      <td>13.63</td>\n",
       "      <td>2189683</td>\n",
       "      <td>3.110729e+09</td>\n",
       "      <td>5.74</td>\n",
       "      <td>3.99</td>\n",
       "      <td>0.55</td>\n",
       "      <td>1.13</td>\n",
       "      <td>平安银行</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2023-01-05</td>\n",
       "      <td>000001</td>\n",
       "      <td>14.40</td>\n",
       "      <td>14.48</td>\n",
       "      <td>14.74</td>\n",
       "      <td>14.37</td>\n",
       "      <td>1665425</td>\n",
       "      <td>2.417272e+09</td>\n",
       "      <td>2.58</td>\n",
       "      <td>1.12</td>\n",
       "      <td>0.16</td>\n",
       "      <td>0.86</td>\n",
       "      <td>平安银行</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2023-01-06</td>\n",
       "      <td>000001</td>\n",
       "      <td>14.50</td>\n",
       "      <td>14.62</td>\n",
       "      <td>14.72</td>\n",
       "      <td>14.48</td>\n",
       "      <td>1195745</td>\n",
       "      <td>1.747915e+09</td>\n",
       "      <td>1.66</td>\n",
       "      <td>0.97</td>\n",
       "      <td>0.14</td>\n",
       "      <td>0.62</td>\n",
       "      <td>平安银行</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2023-01-09</td>\n",
       "      <td>000001</td>\n",
       "      <td>14.75</td>\n",
       "      <td>14.80</td>\n",
       "      <td>14.88</td>\n",
       "      <td>14.52</td>\n",
       "      <td>1057659</td>\n",
       "      <td>1.561368e+09</td>\n",
       "      <td>2.46</td>\n",
       "      <td>1.23</td>\n",
       "      <td>0.18</td>\n",
       "      <td>0.55</td>\n",
       "      <td>平安银行</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           日期    股票代码     开盘     收盘     最高     最低      成交量           成交额  \\\n",
       "0  2023-01-03  000001  13.20  13.77  13.85  13.05  2194128  2.971547e+09   \n",
       "1  2023-01-04  000001  13.71  14.32  14.42  13.63  2189683  3.110729e+09   \n",
       "2  2023-01-05  000001  14.40  14.48  14.74  14.37  1665425  2.417272e+09   \n",
       "3  2023-01-06  000001  14.50  14.62  14.72  14.48  1195745  1.747915e+09   \n",
       "4  2023-01-09  000001  14.75  14.80  14.88  14.52  1057659  1.561368e+09   \n",
       "\n",
       "     振幅   涨跌幅   涨跌额   换手率  股票简称  \n",
       "0  6.08  4.64  0.61  1.13  平安银行  \n",
       "1  5.74  3.99  0.55  1.13  平安银行  \n",
       "2  2.58  1.12  0.16  0.86  平安银行  \n",
       "3  1.66  0.97  0.14  0.62  平安银行  \n",
       "4  2.46  1.23  0.18  0.55  平安银行  "
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Download the 2023 quotes of every stock listed in `data`\n",
    "# (`ak` is imported in the first code cell of the notebook).\n",
    "data_fi = []\n",
    "# Walk the two columns in parallel rather than looking rows up by label, so the\n",
    "# loop does not depend on `data` carrying a default 0..n-1 index.\n",
    "for stock_code, stock_name in zip(data['股票代码'], data['股票简称']):\n",
    "    result = ak.stock_zh_a_hist(symbol=stock_code, start_date='20230101', end_date='20231231')\n",
    "    # Tag each row with the stock it belongs to.\n",
    "    result['股票代码'] = stock_code\n",
    "    result['股票简称'] = stock_name\n",
    "    # Keep one workbook per stock in addition to the combined table.\n",
    "    result.to_excel(f'{stock_code}.xlsx', index=False)\n",
    "    data_fi.append(result)\n",
    "# ignore_index=True gives the combined frame a unique 0..n-1 index instead of\n",
    "# repeating each stock's own row labels.\n",
    "data_fina = pd.concat(data_fi, axis=0, ignore_index=True)\n",
    "print(data_fina.shape)\n",
    "data_fina.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "dcffd0a0",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:16:31.138120Z",
     "start_time": "2024-06-05T02:16:29.792044Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>日期</th>\n",
       "      <th>股票代码</th>\n",
       "      <th>开盘</th>\n",
       "      <th>收盘</th>\n",
       "      <th>最高</th>\n",
       "      <th>最低</th>\n",
       "      <th>成交量</th>\n",
       "      <th>成交额</th>\n",
       "      <th>振幅</th>\n",
       "      <th>涨跌幅</th>\n",
       "      <th>涨跌额</th>\n",
       "      <th>换手率</th>\n",
       "      <th>股票简称</th>\n",
       "      <th>年</th>\n",
       "      <th>月</th>\n",
       "      <th>日</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2023-01-03</td>\n",
       "      <td>000001</td>\n",
       "      <td>13.20</td>\n",
       "      <td>13.77</td>\n",
       "      <td>13.85</td>\n",
       "      <td>13.05</td>\n",
       "      <td>2194128</td>\n",
       "      <td>2.971547e+09</td>\n",
       "      <td>6.08</td>\n",
       "      <td>4.64</td>\n",
       "      <td>0.61</td>\n",
       "      <td>1.13</td>\n",
       "      <td>平安银行</td>\n",
       "      <td>2023</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2023-01-04</td>\n",
       "      <td>000001</td>\n",
       "      <td>13.71</td>\n",
       "      <td>14.32</td>\n",
       "      <td>14.42</td>\n",
       "      <td>13.63</td>\n",
       "      <td>2189683</td>\n",
       "      <td>3.110729e+09</td>\n",
       "      <td>5.74</td>\n",
       "      <td>3.99</td>\n",
       "      <td>0.55</td>\n",
       "      <td>1.13</td>\n",
       "      <td>平安银行</td>\n",
       "      <td>2023</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2023-01-05</td>\n",
       "      <td>000001</td>\n",
       "      <td>14.40</td>\n",
       "      <td>14.48</td>\n",
       "      <td>14.74</td>\n",
       "      <td>14.37</td>\n",
       "      <td>1665425</td>\n",
       "      <td>2.417272e+09</td>\n",
       "      <td>2.58</td>\n",
       "      <td>1.12</td>\n",
       "      <td>0.16</td>\n",
       "      <td>0.86</td>\n",
       "      <td>平安银行</td>\n",
       "      <td>2023</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2023-01-06</td>\n",
       "      <td>000001</td>\n",
       "      <td>14.50</td>\n",
       "      <td>14.62</td>\n",
       "      <td>14.72</td>\n",
       "      <td>14.48</td>\n",
       "      <td>1195745</td>\n",
       "      <td>1.747915e+09</td>\n",
       "      <td>1.66</td>\n",
       "      <td>0.97</td>\n",
       "      <td>0.14</td>\n",
       "      <td>0.62</td>\n",
       "      <td>平安银行</td>\n",
       "      <td>2023</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2023-01-09</td>\n",
       "      <td>000001</td>\n",
       "      <td>14.75</td>\n",
       "      <td>14.80</td>\n",
       "      <td>14.88</td>\n",
       "      <td>14.52</td>\n",
       "      <td>1057659</td>\n",
       "      <td>1.561368e+09</td>\n",
       "      <td>2.46</td>\n",
       "      <td>1.23</td>\n",
       "      <td>0.18</td>\n",
       "      <td>0.55</td>\n",
       "      <td>平安银行</td>\n",
       "      <td>2023</td>\n",
       "      <td>1</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           日期    股票代码     开盘     收盘     最高     最低      成交量           成交额  \\\n",
       "0  2023-01-03  000001  13.20  13.77  13.85  13.05  2194128  2.971547e+09   \n",
       "1  2023-01-04  000001  13.71  14.32  14.42  13.63  2189683  3.110729e+09   \n",
       "2  2023-01-05  000001  14.40  14.48  14.74  14.37  1665425  2.417272e+09   \n",
       "3  2023-01-06  000001  14.50  14.62  14.72  14.48  1195745  1.747915e+09   \n",
       "4  2023-01-09  000001  14.75  14.80  14.88  14.52  1057659  1.561368e+09   \n",
       "\n",
       "     振幅   涨跌幅   涨跌额   换手率  股票简称     年  月  日  \n",
       "0  6.08  4.64  0.61  1.13  平安银行  2023  1  3  \n",
       "1  5.74  3.99  0.55  1.13  平安银行  2023  1  4  \n",
       "2  2.58  1.12  0.16  0.86  平安银行  2023  1  5  \n",
       "3  1.66  0.97  0.14  0.62  平安银行  2023  1  6  \n",
       "4  2.46  1.23  0.18  0.55  平安银行  2023  1  9  "
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Parse the date column in one vectorised call (rather than converting it\n",
    "# value by value) so that the .dt accessor is available.\n",
    "date_series = pd.to_datetime(data_fina['日期'])\n",
    "# Split each date into separate year / month / day columns.\n",
    "data_fina['年'] = date_series.dt.year\n",
    "data_fina['月'] = date_series.dt.month\n",
    "data_fina['日'] = date_series.dt.day\n",
    "data_fina.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "13b078b0",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:16:36.524589Z",
     "start_time": "2024-06-05T02:16:31.139119Z"
    }
   },
   "outputs": [],
   "source": [
    "# Save the combined quote table to a single workbook;\n",
    "# index=False keeps the row index out of the sheet.\n",
    "data_fina.to_excel(\n",
    "    excel_writer='A股中银行行业的股票历史行情数据_预处理.xlsx',\n",
    "    index=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f9ed11d",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "a6c48d7e",
   "metadata": {},
   "source": [
    "# 实战演练——新浪财经历史分红数据的获取与预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "79b0ecdc",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:16:36.868584Z",
     "start_time": "2024-06-05T02:16:36.525588Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>代码</th>\n",
       "      <th>名称</th>\n",
       "      <th>上市日期</th>\n",
       "      <th>累计股息(%)</th>\n",
       "      <th>年均股息(%)</th>\n",
       "      <th>分红次数</th>\n",
       "      <th>融资总额(亿)</th>\n",
       "      <th>融资次数</th>\n",
       "      <th>详细</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>550</td>\n",
       "      <td>江铃汽车</td>\n",
       "      <td>1993-12-01</td>\n",
       "      <td>213.0</td>\n",
       "      <td>6.87</td>\n",
       "      <td>51</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>541</td>\n",
       "      <td>佛山照明</td>\n",
       "      <td>1993-11-23</td>\n",
       "      <td>192.0</td>\n",
       "      <td>6.19</td>\n",
       "      <td>56</td>\n",
       "      <td>10.8842</td>\n",
       "      <td>1</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>429</td>\n",
       "      <td>粤高速A</td>\n",
       "      <td>1998-02-20</td>\n",
       "      <td>171.4</td>\n",
       "      <td>6.35</td>\n",
       "      <td>50</td>\n",
       "      <td>16.3350</td>\n",
       "      <td>1</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>726</td>\n",
       "      <td>鲁泰A</td>\n",
       "      <td>2000-12-25</td>\n",
       "      <td>148.7</td>\n",
       "      <td>6.20</td>\n",
       "      <td>51</td>\n",
       "      <td>9.5082</td>\n",
       "      <td>1</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>581</td>\n",
       "      <td>威孚高科</td>\n",
       "      <td>1998-09-24</td>\n",
       "      <td>145.7</td>\n",
       "      <td>5.60</td>\n",
       "      <td>50</td>\n",
       "      <td>28.5012</td>\n",
       "      <td>1</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>22</td>\n",
       "      <td>深赤湾A</td>\n",
       "      <td>1993-05-05</td>\n",
       "      <td>132.3</td>\n",
       "      <td>5.09</td>\n",
       "      <td>45</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>539</td>\n",
       "      <td>粤电力A</td>\n",
       "      <td>1993-11-26</td>\n",
       "      <td>130.0</td>\n",
       "      <td>4.19</td>\n",
       "      <td>53</td>\n",
       "      <td>8.0981</td>\n",
       "      <td>1</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>488</td>\n",
       "      <td>ST晨鸣</td>\n",
       "      <td>2000-11-20</td>\n",
       "      <td>123.7</td>\n",
       "      <td>5.15</td>\n",
       "      <td>40</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>761</td>\n",
       "      <td>本钢板材</td>\n",
       "      <td>1998-01-15</td>\n",
       "      <td>123.6</td>\n",
       "      <td>4.58</td>\n",
       "      <td>38</td>\n",
       "      <td>39.6774</td>\n",
       "      <td>1</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>12</td>\n",
       "      <td>南玻A</td>\n",
       "      <td>1992-02-28</td>\n",
       "      <td>114.1</td>\n",
       "      <td>3.46</td>\n",
       "      <td>57</td>\n",
       "      <td>13.7200</td>\n",
       "      <td>1</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>600066</td>\n",
       "      <td>宇通客车</td>\n",
       "      <td>1997-05-08</td>\n",
       "      <td>113.2</td>\n",
       "      <td>4.04</td>\n",
       "      <td>27</td>\n",
       "      <td>21.8949</td>\n",
       "      <td>1</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>600177</td>\n",
       "      <td>雅戈尔</td>\n",
       "      <td>1998-11-19</td>\n",
       "      <td>108.0</td>\n",
       "      <td>4.15</td>\n",
       "      <td>29</td>\n",
       "      <td>49.4842</td>\n",
       "      <td>1</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>2</td>\n",
       "      <td>万科A</td>\n",
       "      <td>1991-01-29</td>\n",
       "      <td>104.8</td>\n",
       "      <td>3.08</td>\n",
       "      <td>49</td>\n",
       "      <td>141.3330</td>\n",
       "      <td>2</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>600019</td>\n",
       "      <td>宝钢股份</td>\n",
       "      <td>2000-12-12</td>\n",
       "      <td>98.0</td>\n",
       "      <td>4.09</td>\n",
       "      <td>28</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>600377</td>\n",
       "      <td>宁沪高速</td>\n",
       "      <td>2001-01-16</td>\n",
       "      <td>96.0</td>\n",
       "      <td>4.00</td>\n",
       "      <td>24</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>600028</td>\n",
       "      <td>中国石化</td>\n",
       "      <td>2001-08-08</td>\n",
       "      <td>95.3</td>\n",
       "      <td>3.97</td>\n",
       "      <td>46</td>\n",
       "      <td>119.8730</td>\n",
       "      <td>1</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>651</td>\n",
       "      <td>格力电器</td>\n",
       "      <td>1996-11-18</td>\n",
       "      <td>95.0</td>\n",
       "      <td>3.39</td>\n",
       "      <td>29</td>\n",
       "      <td>43.3065</td>\n",
       "      <td>2</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>600282</td>\n",
       "      <td>南钢股份</td>\n",
       "      <td>2000-09-19</td>\n",
       "      <td>92.9</td>\n",
       "      <td>3.87</td>\n",
       "      <td>22</td>\n",
       "      <td>19.5001</td>\n",
       "      <td>2</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>600153</td>\n",
       "      <td>建发股份</td>\n",
       "      <td>1998-06-16</td>\n",
       "      <td>92.1</td>\n",
       "      <td>3.41</td>\n",
       "      <td>27</td>\n",
       "      <td>45.0896</td>\n",
       "      <td>2</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>601006</td>\n",
       "      <td>大秦铁路</td>\n",
       "      <td>2006-08-01</td>\n",
       "      <td>91.1</td>\n",
       "      <td>4.80</td>\n",
       "      <td>19</td>\n",
       "      <td>309.5330</td>\n",
       "      <td>2</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>895</td>\n",
       "      <td>双汇发展</td>\n",
       "      <td>1998-12-10</td>\n",
       "      <td>88.6</td>\n",
       "      <td>3.41</td>\n",
       "      <td>30</td>\n",
       "      <td>69.6768</td>\n",
       "      <td>1</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>30</td>\n",
       "      <td>富奥股份</td>\n",
       "      <td>1993-09-29</td>\n",
       "      <td>88.3</td>\n",
       "      <td>2.85</td>\n",
       "      <td>26</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>600507</td>\n",
       "      <td>方大特钢</td>\n",
       "      <td>2003-09-30</td>\n",
       "      <td>88.2</td>\n",
       "      <td>4.20</td>\n",
       "      <td>15</td>\n",
       "      <td>8.7208</td>\n",
       "      <td>1</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>601088</td>\n",
       "      <td>中国神华</td>\n",
       "      <td>2007-10-09</td>\n",
       "      <td>87.5</td>\n",
       "      <td>5.15</td>\n",
       "      <td>18</td>\n",
       "      <td>659.8840</td>\n",
       "      <td>1</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>600012</td>\n",
       "      <td>皖通高速</td>\n",
       "      <td>2003-01-07</td>\n",
       "      <td>86.0</td>\n",
       "      <td>3.91</td>\n",
       "      <td>23</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>601988</td>\n",
       "      <td>中国银行</td>\n",
       "      <td>2006-07-05</td>\n",
       "      <td>85.7</td>\n",
       "      <td>4.51</td>\n",
       "      <td>19</td>\n",
       "      <td>610.9070</td>\n",
       "      <td>2</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>600642</td>\n",
       "      <td>申能股份</td>\n",
       "      <td>1993-04-16</td>\n",
       "      <td>85.4</td>\n",
       "      <td>2.67</td>\n",
       "      <td>31</td>\n",
       "      <td>53.0172</td>\n",
       "      <td>3</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>11</td>\n",
       "      <td>深物业A</td>\n",
       "      <td>1992-03-30</td>\n",
       "      <td>84.2</td>\n",
       "      <td>2.55</td>\n",
       "      <td>31</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>601398</td>\n",
       "      <td>工商银行</td>\n",
       "      <td>2006-10-27</td>\n",
       "      <td>84.2</td>\n",
       "      <td>4.68</td>\n",
       "      <td>19</td>\n",
       "      <td>791.5700</td>\n",
       "      <td>2</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>531</td>\n",
       "      <td>穗恒运A</td>\n",
       "      <td>1994-01-06</td>\n",
       "      <td>84.1</td>\n",
       "      <td>2.71</td>\n",
       "      <td>27</td>\n",
       "      <td>13.4607</td>\n",
       "      <td>1</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>26</td>\n",
       "      <td>飞亚达</td>\n",
       "      <td>1993-06-03</td>\n",
       "      <td>83.5</td>\n",
       "      <td>2.61</td>\n",
       "      <td>48</td>\n",
       "      <td>10.6569</td>\n",
       "      <td>2</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>937</td>\n",
       "      <td>冀中能源</td>\n",
       "      <td>1999-09-09</td>\n",
       "      <td>83.5</td>\n",
       "      <td>3.34</td>\n",
       "      <td>26</td>\n",
       "      <td>30.9075</td>\n",
       "      <td>1</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>601939</td>\n",
       "      <td>建设银行</td>\n",
       "      <td>2007-09-25</td>\n",
       "      <td>83.0</td>\n",
       "      <td>4.88</td>\n",
       "      <td>19</td>\n",
       "      <td>593.4432</td>\n",
       "      <td>2</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>600188</td>\n",
       "      <td>兖矿能源</td>\n",
       "      <td>1998-07-01</td>\n",
       "      <td>82.0</td>\n",
       "      <td>3.04</td>\n",
       "      <td>32</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>600104</td>\n",
       "      <td>上汽集团</td>\n",
       "      <td>1997-11-25</td>\n",
       "      <td>81.9</td>\n",
       "      <td>3.03</td>\n",
       "      <td>24</td>\n",
       "      <td>247.3350</td>\n",
       "      <td>2</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>869</td>\n",
       "      <td>张裕A</td>\n",
       "      <td>2000-10-26</td>\n",
       "      <td>81.6</td>\n",
       "      <td>3.40</td>\n",
       "      <td>47</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>600548</td>\n",
       "      <td>深高速</td>\n",
       "      <td>2001-12-25</td>\n",
       "      <td>80.6</td>\n",
       "      <td>3.51</td>\n",
       "      <td>23</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>530</td>\n",
       "      <td>冰山冷热</td>\n",
       "      <td>1993-12-08</td>\n",
       "      <td>80.1</td>\n",
       "      <td>2.58</td>\n",
       "      <td>54</td>\n",
       "      <td>5.6128</td>\n",
       "      <td>1</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>601328</td>\n",
       "      <td>交通银行</td>\n",
       "      <td>2007-05-15</td>\n",
       "      <td>79.8</td>\n",
       "      <td>4.44</td>\n",
       "      <td>21</td>\n",
       "      <td>714.1740</td>\n",
       "      <td>3</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>600350</td>\n",
       "      <td>山东高速</td>\n",
       "      <td>2002-03-18</td>\n",
       "      <td>79.6</td>\n",
       "      <td>3.46</td>\n",
       "      <td>22</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>详细</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        代码    名称        上市日期  累计股息(%)  年均股息(%)  分红次数   融资总额(亿)  融资次数  详细\n",
       "0      550  江铃汽车  1993-12-01    213.0     6.87    51    0.0000     0  详细\n",
       "1      541  佛山照明  1993-11-23    192.0     6.19    56   10.8842     1  详细\n",
       "2      429  粤高速A  1998-02-20    171.4     6.35    50   16.3350     1  详细\n",
       "3      726   鲁泰A  2000-12-25    148.7     6.20    51    9.5082     1  详细\n",
       "4      581  威孚高科  1998-09-24    145.7     5.60    50   28.5012     1  详细\n",
       "5       22  深赤湾A  1993-05-05    132.3     5.09    45    0.0000     0  详细\n",
       "6      539  粤电力A  1993-11-26    130.0     4.19    53    8.0981     1  详细\n",
       "7      488  ST晨鸣  2000-11-20    123.7     5.15    40    0.0000     0  详细\n",
       "8      761  本钢板材  1998-01-15    123.6     4.58    38   39.6774     1  详细\n",
       "9       12   南玻A  1992-02-28    114.1     3.46    57   13.7200     1  详细\n",
       "10  600066  宇通客车  1997-05-08    113.2     4.04    27   21.8949     1  详细\n",
       "11  600177   雅戈尔  1998-11-19    108.0     4.15    29   49.4842     1  详细\n",
       "12       2   万科A  1991-01-29    104.8     3.08    49  141.3330     2  详细\n",
       "13  600019  宝钢股份  2000-12-12     98.0     4.09    28    0.0000     0  详细\n",
       "14  600377  宁沪高速  2001-01-16     96.0     4.00    24    0.0000     0  详细\n",
       "15  600028  中国石化  2001-08-08     95.3     3.97    46  119.8730     1  详细\n",
       "16     651  格力电器  1996-11-18     95.0     3.39    29   43.3065     2  详细\n",
       "17  600282  南钢股份  2000-09-19     92.9     3.87    22   19.5001     2  详细\n",
       "18  600153  建发股份  1998-06-16     92.1     3.41    27   45.0896     2  详细\n",
       "19  601006  大秦铁路  2006-08-01     91.1     4.80    19  309.5330     2  详细\n",
       "20     895  双汇发展  1998-12-10     88.6     3.41    30   69.6768     1  详细\n",
       "21      30  富奥股份  1993-09-29     88.3     2.85    26    0.0000     0  详细\n",
       "22  600507  方大特钢  2003-09-30     88.2     4.20    15    8.7208     1  详细\n",
       "23  601088  中国神华  2007-10-09     87.5     5.15    18  659.8840     1  详细\n",
       "24  600012  皖通高速  2003-01-07     86.0     3.91    23    0.0000     0  详细\n",
       "25  601988  中国银行  2006-07-05     85.7     4.51    19  610.9070     2  详细\n",
       "26  600642  申能股份  1993-04-16     85.4     2.67    31   53.0172     3  详细\n",
       "27      11  深物业A  1992-03-30     84.2     2.55    31    0.0000     0  详细\n",
       "28  601398  工商银行  2006-10-27     84.2     4.68    19  791.5700     2  详细\n",
       "29     531  穗恒运A  1994-01-06     84.1     2.71    27   13.4607     1  详细\n",
       "30      26   飞亚达  1993-06-03     83.5     2.61    48   10.6569     2  详细\n",
       "31     937  冀中能源  1999-09-09     83.5     3.34    26   30.9075     1  详细\n",
       "32  601939  建设银行  2007-09-25     83.0     4.88    19  593.4432     2  详细\n",
       "33  600188  兖矿能源  1998-07-01     82.0     3.04    32    0.0000     0  详细\n",
       "34  600104  上汽集团  1997-11-25     81.9     3.03    24  247.3350     2  详细\n",
       "35     869   张裕A  2000-10-26     81.6     3.40    47    0.0000     0  详细\n",
       "36  600548   深高速  2001-12-25     80.6     3.51    23    0.0000     0  详细\n",
       "37     530  冰山冷热  1993-12-08     80.1     2.58    54    5.6128     1  详细\n",
       "38  601328  交通银行  2007-05-15     79.8     4.44    21  714.1740     3  详细\n",
       "39  600350  山东高速  2002-03-18     79.6     3.46    22    0.0000     0  详细"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "html=\"http://vip.stock.finance.sina.com.cn/q/go.php/vInvestConsult/kind/lsfh/index.phtml\"\n",
    "tabs=pd.read_html(html,encoding=\"gbk\")\n",
    "tabs[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "c9de080d",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:16:51.004988Z",
     "start_time": "2024-06-05T02:16:36.870113Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "dat=[]\n",
    "for i in range(1,100): \n",
    "    d=pd.read_html(\"http://vip.stock.finance.sina.com.cn/q/go.php/vInvestConsult/kind/lsfh/index.phtml?p=\"+str(i),encoding=\"gbk\")[0]\n",
    "    dat.append(d)\n",
    "data=pd.concat(dat,axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "f3d2ca47",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:16:51.609735Z",
     "start_time": "2024-06-05T02:16:51.007989Z"
    }
   },
   "outputs": [],
   "source": [
    "data.to_excel(\"新浪财经历史分红数据.xlsx\",index=False) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "f611b494",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:16:52.334802Z",
     "start_time": "2024-06-05T02:16:51.610734Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "数据形状： (3960, 9)\n",
      "代码去重后个数： 3618\n",
      "名称去重后个数： 3617\n",
      "详细去重后个数： 1\n"
     ]
    }
   ],
   "source": [
    "data=pd.read_excel(\"新浪财经历史分红数据.xlsx\")\n",
    "print(\"数据形状：\",data.shape)\n",
    "print(\"代码去重后个数：\",len(data['代码'].drop_duplicates()))\n",
    "print(\"名称去重后个数：\",len(data['名称'].drop_duplicates()))\n",
    "print(\"详细去重后个数：\",len(data['详细'].drop_duplicates()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "279dd765",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:16:52.341633Z",
     "start_time": "2024-06-05T02:16:52.336799Z"
    }
   },
   "outputs": [],
   "source": [
    "data=data.drop([\"详细\"],axis=1)#删除列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "6bc6c0b1",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:16:52.555980Z",
     "start_time": "2024-06-05T02:16:52.343663Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>代码</th>\n",
       "      <th>名称</th>\n",
       "      <th>上市日期</th>\n",
       "      <th>累计股息(%)</th>\n",
       "      <th>年均股息(%)</th>\n",
       "      <th>分红次数</th>\n",
       "      <th>融资总额(亿)</th>\n",
       "      <th>融资次数</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>550</td>\n",
       "      <td>江铃汽车</td>\n",
       "      <td>1993-12-01</td>\n",
       "      <td>213.0</td>\n",
       "      <td>6.87</td>\n",
       "      <td>51</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>541</td>\n",
       "      <td>佛山照明</td>\n",
       "      <td>1993-11-23</td>\n",
       "      <td>192.0</td>\n",
       "      <td>6.19</td>\n",
       "      <td>56</td>\n",
       "      <td>10.8842</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>429</td>\n",
       "      <td>粤高速A</td>\n",
       "      <td>1998-02-20</td>\n",
       "      <td>171.4</td>\n",
       "      <td>6.35</td>\n",
       "      <td>50</td>\n",
       "      <td>16.3350</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>726</td>\n",
       "      <td>鲁泰A</td>\n",
       "      <td>2000-12-25</td>\n",
       "      <td>148.7</td>\n",
       "      <td>6.20</td>\n",
       "      <td>51</td>\n",
       "      <td>9.5082</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>581</td>\n",
       "      <td>威孚高科</td>\n",
       "      <td>1998-09-24</td>\n",
       "      <td>145.7</td>\n",
       "      <td>5.60</td>\n",
       "      <td>50</td>\n",
       "      <td>28.5012</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    代码    名称        上市日期  累计股息(%)  年均股息(%)  分红次数  融资总额(亿)  融资次数\n",
       "0  550  江铃汽车  1993-12-01    213.0     6.87    51   0.0000     0\n",
       "1  541  佛山照明  1993-11-23    192.0     6.19    56  10.8842     1\n",
       "2  429  粤高速A  1998-02-20    171.4     6.35    50  16.3350     1\n",
       "3  726   鲁泰A  2000-12-25    148.7     6.20    51   9.5082     1\n",
       "4  581  威孚高科  1998-09-24    145.7     5.60    50  28.5012     1"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "c96aeeed",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:16:52.757242Z",
     "start_time": "2024-06-05T02:16:52.557999Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>代码</th>\n",
       "      <th>名称</th>\n",
       "      <th>上市日期</th>\n",
       "      <th>累计股息(%)</th>\n",
       "      <th>年均股息(%)</th>\n",
       "      <th>分红次数</th>\n",
       "      <th>融资总额(亿)</th>\n",
       "      <th>融资次数</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>000550</td>\n",
       "      <td>江铃汽车</td>\n",
       "      <td>1993-12-01</td>\n",
       "      <td>213.0</td>\n",
       "      <td>6.87</td>\n",
       "      <td>51</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>000541</td>\n",
       "      <td>佛山照明</td>\n",
       "      <td>1993-11-23</td>\n",
       "      <td>192.0</td>\n",
       "      <td>6.19</td>\n",
       "      <td>56</td>\n",
       "      <td>10.8842</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>000429</td>\n",
       "      <td>粤高速A</td>\n",
       "      <td>1998-02-20</td>\n",
       "      <td>171.4</td>\n",
       "      <td>6.35</td>\n",
       "      <td>50</td>\n",
       "      <td>16.3350</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>000726</td>\n",
       "      <td>鲁泰A</td>\n",
       "      <td>2000-12-25</td>\n",
       "      <td>148.7</td>\n",
       "      <td>6.20</td>\n",
       "      <td>51</td>\n",
       "      <td>9.5082</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>000581</td>\n",
       "      <td>威孚高科</td>\n",
       "      <td>1998-09-24</td>\n",
       "      <td>145.7</td>\n",
       "      <td>5.60</td>\n",
       "      <td>50</td>\n",
       "      <td>28.5012</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       代码    名称        上市日期  累计股息(%)  年均股息(%)  分红次数  融资总额(亿)  融资次数\n",
       "0  000550  江铃汽车  1993-12-01    213.0     6.87    51   0.0000     0\n",
       "1  000541  佛山照明  1993-11-23    192.0     6.19    56  10.8842     1\n",
       "2  000429  粤高速A  1998-02-20    171.4     6.35    50  16.3350     1\n",
       "3  000726   鲁泰A  2000-12-25    148.7     6.20    51   9.5082     1\n",
       "4  000581  威孚高科  1998-09-24    145.7     5.60    50  28.5012     1"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def fill_zero(x):\n",
    "    x=str(x) #将x转换为字符型\n",
    "    xx=x.zfill(6)#通过0补全6位\n",
    "    return(xx)\n",
    "data[\"代码\"]=data[\"代码\"].apply(fill_zero)\n",
    "data.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "007b37c2",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:16:54.434867Z",
     "start_time": "2024-06-05T02:16:52.759263Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>代码</th>\n",
       "      <th>名称</th>\n",
       "      <th>上市日期</th>\n",
       "      <th>累计股息(%)</th>\n",
       "      <th>年均股息(%)</th>\n",
       "      <th>分红次数</th>\n",
       "      <th>融资总额(亿)</th>\n",
       "      <th>融资次数</th>\n",
       "      <th>年</th>\n",
       "      <th>月</th>\n",
       "      <th>日</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>000550</td>\n",
       "      <td>江铃汽车</td>\n",
       "      <td>1993-12-01</td>\n",
       "      <td>213.0</td>\n",
       "      <td>6.87</td>\n",
       "      <td>51</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>1993</td>\n",
       "      <td>12</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>000541</td>\n",
       "      <td>佛山照明</td>\n",
       "      <td>1993-11-23</td>\n",
       "      <td>192.0</td>\n",
       "      <td>6.19</td>\n",
       "      <td>56</td>\n",
       "      <td>10.8842</td>\n",
       "      <td>1</td>\n",
       "      <td>1993</td>\n",
       "      <td>11</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>000429</td>\n",
       "      <td>粤高速A</td>\n",
       "      <td>1998-02-20</td>\n",
       "      <td>171.4</td>\n",
       "      <td>6.35</td>\n",
       "      <td>50</td>\n",
       "      <td>16.3350</td>\n",
       "      <td>1</td>\n",
       "      <td>1998</td>\n",
       "      <td>2</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>000726</td>\n",
       "      <td>鲁泰A</td>\n",
       "      <td>2000-12-25</td>\n",
       "      <td>148.7</td>\n",
       "      <td>6.20</td>\n",
       "      <td>51</td>\n",
       "      <td>9.5082</td>\n",
       "      <td>1</td>\n",
       "      <td>2000</td>\n",
       "      <td>12</td>\n",
       "      <td>25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>000581</td>\n",
       "      <td>威孚高科</td>\n",
       "      <td>1998-09-24</td>\n",
       "      <td>145.7</td>\n",
       "      <td>5.60</td>\n",
       "      <td>50</td>\n",
       "      <td>28.5012</td>\n",
       "      <td>1</td>\n",
       "      <td>1998</td>\n",
       "      <td>9</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       代码    名称        上市日期  累计股息(%)  年均股息(%)  分红次数  融资总额(亿)  融资次数     年   月  \\\n",
       "0  000550  江铃汽车  1993-12-01    213.0     6.87    51   0.0000     0  1993  12   \n",
       "1  000541  佛山照明  1993-11-23    192.0     6.19    56  10.8842     1  1993  11   \n",
       "2  000429  粤高速A  1998-02-20    171.4     6.35    50  16.3350     1  1998   2   \n",
       "3  000726   鲁泰A  2000-12-25    148.7     6.20    51   9.5082     1  2000  12   \n",
       "4  000581  威孚高科  1998-09-24    145.7     5.60    50  28.5012     1  1998   9   \n",
       "\n",
       "    日  \n",
       "0   1  \n",
       "1  23  \n",
       "2  20  \n",
       "3  25  \n",
       "4  24  "
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 将日期字符串转换为datetime对象\n",
    "date_series  =  data[\"上市日期\"].apply(pd.to_datetime)\n",
    "# 提取年月日\n",
    "data[\"年\"]=date_series.dt.year\n",
    "data[\"月\"]=date_series.dt.month\n",
    "data[\"日\"]=date_series.dt.day\n",
    "data.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "794182f1",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:16:54.458925Z",
     "start_time": "2024-06-05T02:16:54.436864Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>代码</th>\n",
       "      <th>名称</th>\n",
       "      <th>上市日期</th>\n",
       "      <th>累计股息(%)</th>\n",
       "      <th>年均股息(%)</th>\n",
       "      <th>分红次数</th>\n",
       "      <th>融资总额(亿)</th>\n",
       "      <th>融资次数</th>\n",
       "      <th>年</th>\n",
       "      <th>月</th>\n",
       "      <th>日</th>\n",
       "      <th>Z累计股息(%)</th>\n",
       "      <th>Z年均股息(%)</th>\n",
       "      <th>Z分红次数</th>\n",
       "      <th>Z融资总额(亿)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>000550</td>\n",
       "      <td>江铃汽车</td>\n",
       "      <td>1993-12-01</td>\n",
       "      <td>213.0</td>\n",
       "      <td>6.87</td>\n",
       "      <td>51</td>\n",
       "      <td>0.0000</td>\n",
       "      <td>0</td>\n",
       "      <td>1993</td>\n",
       "      <td>12</td>\n",
       "      <td>1</td>\n",
       "      <td>12.231068</td>\n",
       "      <td>6.303080</td>\n",
       "      <td>5.487792</td>\n",
       "      <td>-0.387413</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>000541</td>\n",
       "      <td>佛山照明</td>\n",
       "      <td>1993-11-23</td>\n",
       "      <td>192.0</td>\n",
       "      <td>6.19</td>\n",
       "      <td>56</td>\n",
       "      <td>10.8842</td>\n",
       "      <td>1</td>\n",
       "      <td>1993</td>\n",
       "      <td>11</td>\n",
       "      <td>23</td>\n",
       "      <td>10.936265</td>\n",
       "      <td>5.553305</td>\n",
       "      <td>6.168622</td>\n",
       "      <td>-0.224371</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>000429</td>\n",
       "      <td>粤高速A</td>\n",
       "      <td>1998-02-20</td>\n",
       "      <td>171.4</td>\n",
       "      <td>6.35</td>\n",
       "      <td>50</td>\n",
       "      <td>16.3350</td>\n",
       "      <td>1</td>\n",
       "      <td>1998</td>\n",
       "      <td>2</td>\n",
       "      <td>20</td>\n",
       "      <td>9.666125</td>\n",
       "      <td>5.729723</td>\n",
       "      <td>5.351627</td>\n",
       "      <td>-0.142719</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>000726</td>\n",
       "      <td>鲁泰A</td>\n",
       "      <td>2000-12-25</td>\n",
       "      <td>148.7</td>\n",
       "      <td>6.20</td>\n",
       "      <td>51</td>\n",
       "      <td>9.5082</td>\n",
       "      <td>1</td>\n",
       "      <td>2000</td>\n",
       "      <td>12</td>\n",
       "      <td>25</td>\n",
       "      <td>8.266504</td>\n",
       "      <td>5.564331</td>\n",
       "      <td>5.487792</td>\n",
       "      <td>-0.244983</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>000581</td>\n",
       "      <td>威孚高科</td>\n",
       "      <td>1998-09-24</td>\n",
       "      <td>145.7</td>\n",
       "      <td>5.60</td>\n",
       "      <td>50</td>\n",
       "      <td>28.5012</td>\n",
       "      <td>1</td>\n",
       "      <td>1998</td>\n",
       "      <td>9</td>\n",
       "      <td>24</td>\n",
       "      <td>8.081532</td>\n",
       "      <td>4.902765</td>\n",
       "      <td>5.351627</td>\n",
       "      <td>0.039527</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       代码    名称        上市日期  累计股息(%)  年均股息(%)  分红次数  融资总额(亿)  融资次数     年   月  \\\n",
       "0  000550  江铃汽车  1993-12-01    213.0     6.87    51   0.0000     0  1993  12   \n",
       "1  000541  佛山照明  1993-11-23    192.0     6.19    56  10.8842     1  1993  11   \n",
       "2  000429  粤高速A  1998-02-20    171.4     6.35    50  16.3350     1  1998   2   \n",
       "3  000726   鲁泰A  2000-12-25    148.7     6.20    51   9.5082     1  2000  12   \n",
       "4  000581  威孚高科  1998-09-24    145.7     5.60    50  28.5012     1  1998   9   \n",
       "\n",
       "    日   Z累计股息(%)  Z年均股息(%)     Z分红次数  Z融资总额(亿)  \n",
       "0   1  12.231068  6.303080  5.487792 -0.387413  \n",
       "1  23  10.936265  5.553305  6.168622 -0.224371  \n",
       "2  20   9.666125  5.729723  5.351627 -0.142719  \n",
       "3  25   8.266504  5.564331  5.487792 -0.244983  \n",
       "4  24   8.081532  4.902765  5.351627  0.039527  "
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "X_scaler= StandardScaler()#创建一个StandardScaler对象\n",
    "X_scaled=X_scaler.fit_transform(data[[\"累计股息(%)\",\"年均股息(%)\",\"分红次数\",\"融资总额(亿)\"]])\n",
    "data_scaled=pd.DataFrame(X_scaled,columns=[\"Z累计股息(%)\",\"Z年均股息(%)\",\"Z分红次数\",\"Z融资总额(亿)\"])\n",
    "data_new=pd.concat([data,data_scaled],axis=1)\n",
    "data_new.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "7b052209",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T02:16:56.004004Z",
     "start_time": "2024-06-05T02:16:54.462916Z"
    }
   },
   "outputs": [],
   "source": [
    "data_new.to_excel(\"新浪财经历史分红数据_预处理.xlsx\",index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2cd580a1",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abea55d2",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "197.2px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
